Example #1
    def _test_broadcast_helper(self, group, group_id, rank, cuda=False, rank_to_GPU=None):
        for ttype, value, requires_cuda in [
            ('torch.FloatTensor', -1e-10, False),
            ('torch.DoubleTensor', -1e-100, False),
            ('torch.HalfTensor', -0.1, True),
            ('torch.CharTensor', -2, False),
            ('torch.ByteTensor', 129, False),
            ('torch.IntTensor', -1e5, False),
            ('torch.LongTensor', -1e15, False),
        ]:
            if requires_cuda and not cuda:
                continue
            for src in group:
                expected_tensor = _build_tensor(src + 1, value).type(ttype)
                if cuda:
                    expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0])
                if rank == src:
                    dist.broadcast(expected_tensor, src, group_id)
                else:
                    tensor = _build_tensor(src + 1, -1).type(ttype)
                    if cuda:
                        tensor = tensor.cuda(rank_to_GPU[rank][0])
                    dist.broadcast(tensor, src, group_id)
                    self.assertEqual(tensor.size(), expected_tensor.size())
                    self.assertEqual(tensor.ne(expected_tensor).max(), 0)

        self._barrier()
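The contract these assertions exercise: every rank calls dist.broadcast with a tensor of identical shape and dtype, the source rank contributes the data, and every other rank receives it in place. A minimal sketch, assuming a process group has already been initialized (e.g. with the gloo backend):

import torch
import torch.distributed as dist

def sync_from_src(rank, src=0):
    # Every rank must pass a tensor with the same shape and dtype.
    tensor = torch.full((4,), float(rank))
    # After the call the tensor holds src's values on every rank (in-place).
    dist.broadcast(tensor, src)
    return tensor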
Example #2
    def consistent_indices(self, rank, indices, shuffle):
        """ synchronize indices among workers. """
        if rank == 0 and shuffle:
            random.shuffle(indices)

        # broadcast.
        indices = torch.IntTensor(indices)
        dist.broadcast(indices, src=0)
        return list(indices)
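A hedged usage sketch, assuming dist.init_process_group has already been called and that every rank starts from the same ordered index list (the sampler object that owns the method is an assumption for illustration):

rank = dist.get_rank()
epoch_indices = sampler.consistent_indices(rank, list(range(50000)), shuffle=True)
# all ranks now iterate the dataset in rank 0's shuffled order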
Example #3
    def _test_broadcast_helper(self, group, group_id, rank, cuda=False):
        for src in group:
            expected_tensor = _build_tensor(src + 1)
            if cuda:
                expected_tensor = expected_tensor.cuda()
            if rank == src:
                dist.broadcast(expected_tensor, src, group_id)
            else:
                tensor = _build_tensor(src + 1, -1)
                if cuda:
                    tensor = tensor.cuda()
                dist.broadcast(tensor, src, group_id)
                self.assertEqual(tensor, expected_tensor)

        self._barrier()
Example #4
    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """
        Broadcast a sequence of tensors to the default group from rank 0.
        Small tensors are first coalesced into a buffer to reduce the number of
        broadcasts.

        tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                            same GPU.
        buffer_size (int): maximum size of the buffer for coalescing
        """
        for tensors in _take_tensors(tensors, buffer_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.broadcast(flat_tensors, 0)
            for tensor, synced in zip(tensors,
                                      _unflatten_dense_tensors(flat_tensors, tensors)):
                tensor.copy_(synced)
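A hedged usage sketch of the same coalescing idea for syncing a whole model once after initialization. The _take_tensors / _flatten_dense_tensors / _unflatten_dense_tensors helpers live in the private torch._utils module, so treat this as an illustration of the technique rather than a stable API:

import torch
import torch.distributed as dist
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors)

def broadcast_state(model, src=0, buffer_size=10 * 1024 * 1024):
    # Pack parameters and buffers into ~buffer_size chunks and broadcast each
    # chunk as one flat tensor, then copy the received values back in place.
    tensors = [t for t in model.state_dict().values() if torch.is_tensor(t)]
    for chunk in _take_tensors(tensors, buffer_size):
        flat = _flatten_dense_tensors(chunk)
        dist.broadcast(flat, src)
        for t, synced in zip(chunk, _unflatten_dense_tensors(flat, chunk)):
            t.copy_(synced)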
Example #5
    def _test_barrier_helper(self, group, group_id, rank):
        WAIT_TIME = 0.3  # seconds

        for dest in group:
            expected_time = torch.DoubleTensor(1).fill_(0.0)
            if dest == rank:
                expected_time.fill_(time.time() + WAIT_TIME)
                dist.broadcast(expected_time, dest, group_id)
                time.sleep(WAIT_TIME + 0.1)  # sleep a little bit longer
                dist.barrier(group_id)
            else:
                dist.broadcast(expected_time, dest, group_id)
                dist.barrier(group_id)
                self.assertGreaterEqual(time.time(), expected_time[0])

        self._barrier()
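Outside of tests, the same barrier primitive is what makes "rank 0 does the work, everyone else waits" patterns safe. A minimal sketch, assuming an initialized process group:

import torch.distributed as dist

def run_once_on_rank_0(fn):
    # Only rank 0 executes fn (e.g. downloading or preprocessing data);
    # the barrier keeps the other ranks from racing ahead until it is done.
    if dist.get_rank() == 0:
        fn()
    dist.barrier()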
Example #6
    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module

        for p in self.module.state_dict().values():
            if not torch.is_tensor(p):
                continue
            if dist._backend == dist.dist_backend.NCCL:
                assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
            dist.broadcast(p, 0)

        def allreduce_params():
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        for param in list(self.module.parameters()):
            def allreduce_hook(*unused):
                param._execution_engine.queue_callback(allreduce_params)

            if param.requires_grad:
                param.register_hook(allreduce_hook)
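A minimal sketch of the same synchronization without the bucketing and autograd hooks: broadcast the parameters once at construction (as above), then average the gradients by hand after each backward pass. This only illustrates the arithmetic, not the optimized path:

import torch
import torch.distributed as dist

def average_gradients(module):
    # Sum each gradient across ranks, then divide by the world size.
    world_size = float(dist.get_world_size())
    for p in module.parameters():
        if p.requires_grad and p.grad is not None:
            dist.all_reduce(p.grad.data, op=dist.ReduceOp.SUM)
            p.grad.data.div_(world_size)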
Example #7
    def _sync_params(self):
        params = [p.data for p in self.module.parameters()]
        result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, param in zip(tensors, module.parameters()):
                param.data.set_(tensor)

        # cross-node buffer sync
        buffers = list(self.module._all_buffers())
        flat_buffers = _flatten_tensors(buffers)
        dist.broadcast(flat_buffers, 0)
        for buf, synced in zip(buffers, _unflatten_tensors(flat_buffers, buffers)):
            buf.copy_(synced)

        # intra-node buffer sync
        result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, buf in zip(tensors, module._all_buffers()):
                buf.set_(tensor)
Example #8
def broadcast_params(model):
    """ broadcast model parameters """
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
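A hedged usage sketch: call it once right after process-group initialization so every worker starts from rank 0's weights (the gloo backend and the torchvision model are assumptions for illustration):

import torch.distributed as dist
import torchvision.models as models

dist.init_process_group(backend='gloo', init_method='env://')
model = models.resnet18()
broadcast_params(model)  # all ranks now hold rank 0's state_dict tensors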
Example #9
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass

        # builtins.print = print_pass

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = pcl.builder.MoCo(
        models.__dict__[args.arch],
        args.low_dim, args.pcl_r, args.moco_m, args.temperature, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    train_dataset, eval_dataset = create_cifar10_dataset(args)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        eval_sampler = torch.utils.data.distributed.DistributedSampler(eval_dataset, shuffle=False)
    else:
        train_sampler = None
        eval_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    # dataloader for center-cropped images, use larger batch size to increase speed
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=args.batch_size * 5, shuffle=False,
        sampler=eval_sampler, num_workers=args.workers, pin_memory=True
    )

    for epoch in range(args.start_epoch, args.epochs):

        cluster_result = None
        if epoch >= args.warmup_epoch:
            # compute momentum features for center-cropped images
            features = compute_features(eval_loader, model, args)

            # placeholder for clustering result
            cluster_result = {'im2cluster': [], 'centroids': [], 'density': []}
            for num_cluster in args.num_cluster:
                cluster_result['im2cluster'].append(torch.zeros(len(eval_dataset), dtype=torch.long).cuda())
                cluster_result['centroids'].append(torch.zeros(int(num_cluster), args.low_dim).cuda())
                cluster_result['density'].append(torch.zeros(int(num_cluster)).cuda())

            if args.gpu == 0:
                features[
                    torch.norm(features, dim=1) > 1.5] /= 2  # account for the few samples that are computed twice
                features = features.numpy()
                cluster_result = run_kmeans(features, args)  # run kmeans clustering on master node
                # save the clustering result
                # torch.save(cluster_result,os.path.join(args.exp_dir, 'clusters_%d'%epoch))  

            dist.barrier()
            # broadcast clustering result
            for k, data_list in cluster_result.items():
                for data_tensor in data_list:
                    dist.broadcast(data_tensor, 0, async_op=False)

        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, cluster_result)

        if (epoch + 1) % 5 == 0 and (not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                                              and args.rank % ngpus_per_node == 0)):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, is_best=False, filename='{}/checkpoint_{:04d}.pth.tar'.format(args.exp_dir, epoch))
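The broadcast at the end of the clustering step only works because every rank pre-allocates placeholder tensors with the same shapes, dtypes and devices as the ones rank 0 fills in. A stripped-down sketch of that pattern (the dictionary layout mirrors cluster_result above; the function name is an assumption):

import torch.distributed as dist

def broadcast_tensor_dict(result, src=0):
    # Non-source ranks must already hold matching placeholder tensors;
    # broadcast overwrites them in place with the source rank's values.
    for tensor_list in result.values():
        for t in tensor_list:
            dist.broadcast(t, src)
    return result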
Example #10
MAX_NUM_TENSORS = args.max_num_tensors + 1
MAX_BYTES = args.max_bytes + 1

dist.init_process_group(backend=os.environ['BACKEND'])

rank = dist.get_rank()
dist.barrier()

if rank == 0:
    print_header("broadcast")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)
dist.barrier()

if rank == 0:
    print_header("send from 0 to 1")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
Example #11
def train(hyp, opt, device, tb_writer=None):
    logger.info(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(
        opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data, 'r+', encoding="utf-8") as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(
                hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3,
                      nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [
        ]  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict,
                                     model.state_dict(),
                                     exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info(
            'Transferred %g/%g items from %s' %
            (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = [
        '',
    ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[
        'lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (
                weights, epochs)
            shutil.copytree(wdir, wdir.parent /
                            f'weights_backup_epoch{start_epoch - 1}'
                            )  # save previous weights
        if epochs < start_epoch:
            logger.info(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            rank=rank,
                                            world_size=opt.world_size,
                                            workers=opt.workers)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path,
                                       imgsz_test,
                                       total_batch_size * 2,
                                       gs,
                                       opt,
                                       hyp=hyp,
                                       augment=False,
                                       cache=opt.cache_images
                                       and not opt.notest,
                                       rect=True,
                                       rank=-1,
                                       world_size=opt.world_size,
                                       workers=opt.workers)[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset,
                              model=model,
                              thr=hyp['anchor_t'],
                              imgsz=imgsz)
                # raise 'dd'

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb),
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\n'
                'Using %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' %
                (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                iw = labels_to_image_weights(dataset.labels,
                                             nc=nc,
                                             class_weights=cw)  # image weights
                dataset.indices = random.choices(
                    range(dataset.n), weights=iw,
                    k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices)
                           if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(
            ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls',
                                   'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()

        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [
                        hyp['warmup_bias_lr'] if j == 2 else 0.0,
                        x['initial_lr'] * lf(epoch)
                    ])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(
                            ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.7),
                                      int(imgsz * 1.3) + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(
                    pred, targets.to(device),
                    model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 +
                     '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem,
                                      *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / f'train_batch{ni}.jpg')  # filename
                    result = plot_images(images=imgs,
                                         targets=targets,
                                         paths=paths,
                                         fname=f)
                    # if tb_writer and result is not None:
                    # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(
                    opt.data,
                    batch_size=total_batch_size * 2,
                    imgsz=imgsz_test,
                    model=ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=log_dir,
                    plots=epoch == 0 or final_epoch)  # plot first and last

            # Write
            with open(results_file, 'a') as f:
                f.write(
                    s + '%10.4g' * 7 % results +
                    '\n')  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                          (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = [
                    'train/box_loss',
                    'train/obj_loss',
                    'train/cls_loss',  # train loss
                    'metrics/precision',
                    'metrics/recall',
                    'metrics/mAP_0.5',
                    'metrics/mAP_0.5:0.95',
                    'val/box_loss',
                    'val/obj_loss',
                    'val/cls_loss',  # val loss
                    'x/lr0',
                    'x/lr1',
                    'x/lr2'
                ]  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch':
                        epoch,
                        'best_fitness':
                        best_fitness,
                        'training_results':
                        f.read(),
                        'model':
                        ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, wdir / 'last{}.pt'.format(epoch))
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt

        from utils.general import plot_results
        plot_results(save_dir=log_dir)
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system(
                        'gsutil cp %s gs://%s/weights' %
                        (f2, opt.bucket)) if opt.bucket else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' %
                    (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
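The image-weights block above is the only dist.broadcast in this training loop: rank 0 draws the weighted sample indices for the epoch and the other DDP ranks receive the same order. A hedged stand-alone sketch of that step, assuming a backend that supports CPU tensors (e.g. gloo); the names are illustrative:

import random
import torch
import torch.distributed as dist

def sync_epoch_indices(n, weights, rank):
    # Rank 0 samples n weighted indices; everyone else receives them via broadcast.
    if rank == 0:
        drawn = random.choices(range(n), weights=weights, k=n)
        indices = torch.tensor(drawn, dtype=torch.int)
    else:
        indices = torch.zeros(n, dtype=torch.int)
    dist.broadcast(indices, 0)
    return indices.tolist()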
Example #12
    def cluster_memory(self):
        self.start_idx = 0
        j = 0
        with torch.no_grad():
            for i_K, K in enumerate(self.num_clusters):
                # run distributed k-means

                # init centroids with elements from memory bank of rank 0
                centroids = torch.empty(
                    K, self.embedding_dim).cuda(non_blocking=True)
                if get_rank() == 0:
                    random_idx = torch.randperm(
                        len(self.local_memory_embeddings[j]))[:K]
                    assert len(random_idx
                               ) >= K, "please reduce the number of centroids"
                    centroids = self.local_memory_embeddings[j][random_idx]
                dist.broadcast(centroids, 0)

                for n_iter in range(self.nmb_kmeans_iters + 1):

                    # E step
                    dot_products = torch.mm(self.local_memory_embeddings[j],
                                            centroids.t())
                    _, assignments = dot_products.max(dim=1)

                    # finish
                    if n_iter == self.nmb_kmeans_iters:
                        break

                    # M step
                    where_helper = get_indices_sparse(
                        assignments.cpu().numpy())
                    counts = torch.zeros(K).cuda(non_blocking=True).int()
                    emb_sums = torch.zeros(
                        K, self.embedding_dim).cuda(non_blocking=True)
                    for k in range(len(where_helper)):
                        if len(where_helper[k][0]) > 0:
                            emb_sums[k] = torch.sum(
                                self.local_memory_embeddings[j][where_helper[k]
                                                                [0]],
                                dim=0,
                            )
                            counts[k] = len(where_helper[k][0])
                    all_reduce_sum(counts)
                    mask = counts > 0
                    all_reduce_sum(emb_sums)
                    centroids[mask] = emb_sums[mask] / counts[mask].unsqueeze(
                        1)

                    # normalize centroids
                    centroids = nn.functional.normalize(centroids, dim=1, p=2)

                getattr(self, "centroids" + str(i_K)).copy_(centroids)
                # gather the assignments
                assignments_all = gather_from_all(assignments)
                indexes_all = gather_from_all(self.local_memory_index)
                self.assignments[i_K] = -100
                self.assignments[i_K][indexes_all] = assignments_all

                j = (j + 1) % self.nmb_mbs

        logging.info(f"Rank: {get_rank()}, clustering of the memory bank done")
Example #13
def train(hyp, tb_writer, opt, device):
    print(f'Hyperparameters {hyp}')
    log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution'  # run directory
    wdir = str(Path(log_dir) / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = log_dir + os.sep + 'results.txt'
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank
    # TODO: Init DDP logging. Only the first process is allowed to log.
    # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs.

    # Save run settings
    with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(Path(log_dir) / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Remove previous results
    if rank in [-1, 0]:
        for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
            os.remove(f)

    # Create model
    model = Model(opt.cfg, nc=nc).to(device)

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html
    # all-reduce operation is carried out during loss.backward().
    # Thus, there would be redundant all-reduce communications in a accumulation procedure,
    # which means, the result is still right but the training speed gets slower.
    # TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation
    # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg2.append(v)  # biases
            elif '.weight' in k and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    if hyp['optimizer'] == 'adam':  # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model
    with torch_distributed_zero_first(rank):
        google_utils.attempt_download(weights)
    start_epoch, best_fitness = 0, 0.0
    if weights.endswith('.pt'):  # pytorch format
        ckpt = torch.load(weights, map_location=device)  # load checkpoint

        # load model
        try:
            exclude = ['anchor']  # exclude keys
            ckpt['model'] = {
                k: v
                for k, v in ckpt['model'].float().state_dict().items()
                if k in model.state_dict() and not any(x in k for x in exclude)
                and model.state_dict()[k].shape == v.shape
            }
            model.load_state_dict(ckpt['model'], strict=False)
            print('Transferred %g/%g items from %s' %
                  (len(ckpt['model']), len(model.state_dict()), weights))
        except KeyError as e:
            s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \
                "Please delete or update %s and try again, or use --weights '' to train from scratch." \
                % (weights, opt.cfg, weights, weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if rank in [-1, 0] and wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(
            config=opt,
            resume="allow",
            project=opt.wandb,
            name=Path(log_dir).stem,
            id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
    loggers = {'wandb': wandb}  # loggers dict

    # DP mode
    if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and device.type != 'cpu' and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average
    ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if device.type != 'cpu' and rank != -1:
        model = DDP(model, device_ids=[rank], output_device=rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        # local_rank is set to -1. Because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path,
                                       imgsz_test,
                                       total_batch_size,
                                       gs,
                                       opt,
                                       hyp=hyp,
                                       augment=False,
                                       cache=opt.cache_images,
                                       rect=True,
                                       local_rank=-1,
                                       world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset,
                          model=model,
                          thr=hyp['anchor_t'],
                          imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb,
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        # When in DDP mode, the generated indices will be broadcasted to synchronize dataset.
        if dataset.image_weights:
            # Generate indices.
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                image_weights = labels_to_image_weights(dataset.labels,
                                                        nc=nc,
                                                        class_weights=w)
                dataset.indices = random.choices(
                    range(dataset.n), weights=image_weights,
                    k=dataset.n)  # rand weighted idx
            # Broadcast.
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(
                ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj',
                                       'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5),
                                      int(imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device),
                                            model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 +
                     '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem,
                                      *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(Path(log_dir) /
                            ('train_batch%g.jpg' % ni))  # filename
                    Thread(target=plot_images,
                           args=(imgs, targets, paths, f),
                           daemon=True).start()
                    # result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    # if tb_writer and result is not None:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif ni == 10 and wandb:
                    wandb.log(
                        {
                            "Mosaics": [
                                wandb.Image(str(x), caption=x.name)
                                for x in Path(log_dir).glob('train*.jpg')
                                if x.exists()
                            ]
                        },
                        commit=False)

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # Only the first process in DDP mode is allowed to log or save checkpoints.
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(
                    opt.data,
                    batch_size=total_batch_size,
                    imgsz=imgsz_test,
                    save_json=final_epoch
                    and opt.data.endswith(os.sep + 'coco.yaml'),
                    model=ema.ema.module
                    if hasattr(ema.ema, 'module') else ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    plots=final_epoch,
                    log_imgs=16 if wandb else 0,
                    save_dir=log_dir)

                # Write
                with open(results_file, 'a') as f:
                    f.write(
                        s + '%10.4g' * 7 % results +
                        '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                if len(opt.name) and opt.bucket:
                    os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                              (results_file, opt.bucket, opt.name))

                # Tensorboard
                tags = [
                    'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5',
                    'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss',
                    'val/cls_loss'
                ]
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    if tb_writer:
                        tb_writer.add_scalar(tag, x, epoch)
                    if wandb:
                        wandb.log({tag: x}, step=epoch,
                                  commit=tag == tags[-1])  # W&B

                # Update best mAP
                fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
                if fi > best_fitness:
                    best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': f.read(),
                        'model': ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                        'optimizer': None if final_epoch else optimizer.state_dict(),
                        'wandb_id': wandb_run.id if wandb else None
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                if ispt:
                    strip_optimizer(f2)  # strip optimizer
                    if opt.bucket:
                        os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket))  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
            if wandb:
                files = [
                    'results.png', 'confusion_matrix.png',
                    *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]
                ]
                wandb.log({
                    "Results": [
                        wandb.Image(str(Path(log_dir) / f), caption=f)
                        for f in files if (Path(log_dir) / f).exists()
                    ]
                })
                try:
                    print("last:", last)
                    wandb.log_artifact(artifact_or_path=str(last),
                                       type='model',
                                       name="last")
                except ValueError:
                    print("last model not found in", last)
                try:
                    print("flast:", flast)
                    wandb.log_artifact(artifact_or_path=str(flast),
                                       type='model',
                                       name="flast")
                except ValueError:
                    print("flast model not found in", flast)
                try:
                    print("best:", best)
                    wandb.log_artifact(artifact_or_path=str(best),
                                       type='model',
                                       name="best")
                except ValueError:
                    print("best model not found in", best)
                try:
                    print("fbest:", fbest)
                    wandb.log_artifact(artifact_or_path=str(fbest),
                                       type='model',
                                       name="fbest")
                except ValueError:
                    print("fbest model not found in", fbest)
        print('%g epochs completed in %.3f hours.\n' %
              (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    if rank not in [-1, 0]:
        dist.destroy_process_group()
    if wandb and wandb.run:
        wandb.run.finish()
    torch.cuda.empty_cache()
    return results
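The best-checkpoint logic in the example above reduces the validation tuple to a single scalar via fitness() before comparing against best_fitness. A minimal sketch of such a weighted combination; the weights and column order here are assumptions for illustration, not necessarily what this repository uses:

import numpy as np

def fitness(x):
    # x: metrics array of shape (n, 4), columns assumed to be [P, R, mAP@0.5, mAP@0.5:0.95]
    w = np.array([0.0, 0.0, 0.1, 0.9])  # assumed relative importance of each metric
    return (x[:, :4] * w).sum(1)        # one scalar fitness per row

# fi = fitness(np.array(results).reshape(1, -1)); keep the checkpoint when fi > best_fitness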
Exemplo n.º 14
0
def train(hyp, opt, device, tb_writer=None, wandb=None):
    logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Directories
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = save_dir / 'results.txt'

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)  # create
        exclude = []
        state_dict = ckpt['model']  # FP32
        #state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=True)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = []  # parameter names to freeze (full or partial)
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            print('freezing %s' % k)
            v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if rank in [-1, 0] and wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(config=opt, resume="allow",
                               project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
                               name=save_dir.stem,
                               id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
    loggers = {'wandb': wandb}  # loggers dict

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        ## Optimizer
        #if ckpt['optimizer'] is not None:
        #    optimizer.load_state_dict(ckpt['optimizer'])
        #    best_fitness = ckpt['best_fitness']

        ## Results
        #if ckpt.get('training_results') is not None:
        #    with open(results_file, 'w') as file:
        #        file.write(ckpt['training_results'])  # write results.txt

        ## Epochs
        #start_epoch = ckpt['epoch'] + 1
        start_epoch = 0
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[-1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # EMA
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
                                            world_size=opt.world_size, workers=opt.workers,
                                            image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '))
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,  # testloader
                                       hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1,
                                       world_size=opt.world_size, workers=opt.workers,
                                       pad=0.5, prefix=colorstr('val: '))[0]

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, save_dir, loggers)
                if tb_writer:
                    tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['box'] *= 3. / nl  # scale to layers
    hyp['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl  # scale to image size and layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    compute_loss = ComputeLoss(model)  # init loss class
    logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n'
                f'Using {dataloader.num_workers} dataloader workers\n'
                f'Logging results to {save_dir}\n'
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        opt_s = opt.s
        if opt.sr_cos:
            mask_period = 2
            #opt_s = opt.s * lf(epoch)
            opt_s = ((((1 + math.cos(epoch * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2) * opt.s
            if opt.sr and epoch % mask_period == 0 and epoch > 0:
                maskBN(model, soft=True)
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
                if opt.sr_cos:
                    opt_s = np.interp(ni, xi, [0.0, opt.s * lf(epoch)])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                if opt.sr:
                    updateBN(opt_s, model)
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if plots and ni < 3:
                    f = save_dir / f'train_batch{ni}.jpg'  # filename
                    Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 10 and wandb:
                    wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')
                                           if x.exists()]})

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=save_dir,
                                                 verbose=nc < 50 and final_epoch,
                                                 plots=plots and final_epoch,
                                                 log_imgs=opt.log_imgs if wandb else 0,
                                                 compute_loss=compute_loss)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Log
            tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                    'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                    'x/lr0', 'x/lr1', 'x/lr2']  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
                if wandb:
                    wandb.log({tag: x})  # W&B

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, [email protected], [email protected]]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict(),
                            'wandb_id': wandb_run.id if wandb else None}

                storage_period = 10
                if epoch % storage_period == 0:
                    torch.save(ckpt, os.path.splitext(last)[0]+'_%s'%epoch+os.path.splitext(last)[1])

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        final = best if best.exists() else last  # final model
        for f in [last, best]:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
        if opt.bucket:
            os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

        # Plots
        if plots:
            plot_results(save_dir=save_dir)  # save as results.png
            if wandb:
                files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png']
                wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files
                                       if (save_dir / f).exists()]})
                if opt.log_artifacts:
                    wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem)

        # Test best.pt
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
            for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]):  # speed, mAP tests
                results, _, _ = test.test(opt.data,
                                          batch_size=total_batch_size,
                                          imgsz=imgsz_test,
                                          conf_thres=conf,
                                          iou_thres=iou,
                                          model=attempt_load(final, device).half(),
                                          single_cls=opt.single_cls,
                                          dataloader=testloader,
                                          save_dir=save_dir,
                                          save_json=save_json,
                                          plots=False)

    else:
        dist.destroy_process_group()

    if wandb and wandb.run:
        wandb.run.finish()
    torch.cuda.empty_cache()
    return results
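The image-weights branch above resamples dataset.indices on rank 0 and then pushes them to every worker with dist.broadcast. A standalone sketch of that pattern; the function name and arguments are illustrative, and a CPU-capable backend such as gloo is assumed since the tensor lives on the host:

import torch
import torch.distributed as dist

def sync_indices(indices, n, rank):
    # rank 0 holds the freshly sampled indices; other ranks allocate a same-sized buffer
    buf = torch.tensor(indices, dtype=torch.int32) if rank == 0 else torch.zeros(n, dtype=torch.int32)
    dist.broadcast(buf, src=0)  # in-place: afterwards every rank holds rank 0's indices
    return buf.tolist()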
Exemplo n.º 15
0
def prune_and_eval(rank, size, orig_fit, acc_constraint, valid, corpus, es,
                   ref_model, num_runs, final_results):
    _valid = valid
    gpu_id = GPU_ID
    total_iterations = es.Tmax / es.popsize
    individual_iter_count = 0
    #ref_model = masked_models[rank]
    X = torch.Tensor(copy.deepcopy(es.pop))
    communicate_size = es.n + 4  # the size of tensors transferred across machines
    communicate_tensor = torch.FloatTensor(communicate_size * [0.])
    fitness_list = []
    itr_best_remain = 0

    if rank == 0:  # rank 0 is the main process that collects fitnesses
        X.share_memory_()
        #fitness_list = [torch.FloatTensor([0.0,0.1,0.2,0.3]).share_memory_() for i in range(size)]
        fitness_list = [
            torch.FloatTensor(communicate_size * [0.]).share_memory_()
            for i in range(size)
        ]

    if rank >= 1 and rank < size:  # split tasks to different GPUs
        gpu_id = other_GPU_IDs[rank - 1]

    with cuda.device(gpu_id):

        while (individual_iter_count < total_iterations):
            if rank == 0:  # master node
                itr_X = torch.Tensor(es.ask())
                # broadcast the parents
                X.copy_(itr_X)
                dist.broadcast(itr_X, 0)
            else:
                # receive parents from the source process
                dist.broadcast(X, 0)

            # apply MP on model
            x = X.numpy()[rank]
            ref_model.change_mask(x, apply_MP_on_mask)

            ref_model.apply_mask()

            # evaluate pruned network
            fitness = evaluate_lm(ref_model.masked_model, _valid, corpus,
                                  TEST_BATCH_SIZE)
            communicate_tensor[0] = fitness[0]
            communicate_tensor[1] = fitness[1]
            communicate_tensor[2] = rank
            communicate_tensor[3] = ref_model.get_sparsity()
            for i in range(x.size):
                communicate_tensor[i + 4] = X[rank, i]  #x[i]

            # sync fitness
            if rank == 0:  # collect fitness across processes
                dist.gather(communicate_tensor, gather_list=fitness_list)
            else:
                dist.gather(communicate_tensor, dst=0)

            # judge new solutions
            if rank == 0:  # negatively correlated search in master node
                fit = []
                X_ = []
                for i in range(es.popsize):
                    the_fitness = 100
                    for j in range(len(fitness_list)):  # results of fitness evaluation
                        if int(fitness_list[j][2]) == i:  # 0: ppl, 1: acc, 2: rank of individual
                            X_.append(fitness_list[j].numpy()[4:])
                            if orig_fit[1] - fitness_list[j][1] <= acc_constraint:
                                the_fitness = -fitness_list[j][3]
                            else:
                                the_fitness = (orig_fit[1] - fitness_list[j][1]) / acc_constraint
                            continue
                    fit.append(the_fitness)

                es.tell(X_, fit)

                itr_best_remain = min(fit)

            final_results['result_NCS'].copy_(torch.Tensor(es.result()[0]))
            individual_iter_count += 1

            if rank == 0:  # record status
                logger.scalar_summary(
                    'ncs_%s_fitness' % num_runs,
                    es.result()[1],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'ncs_%s_best_itr_remain' % num_runs, itr_best_remain,
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'ncs_%s_pop' % num_runs,
                    es.result()[0],
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'pop of 1', X_[0],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'sp of 1', -fitness_list[0][3],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'rank of 1', fitness_list[0][2],
                    num_runs * total_iterations + individual_iter_count)
                logger.histo_summary(
                    'pop of 2', X_[1],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'sp of 2', -fitness_list[1][3],
                    num_runs * total_iterations + individual_iter_count)
                logger.scalar_summary(
                    'rank of 2', fitness_list[1][2],
                    num_runs * total_iterations + individual_iter_count)
                #logger.histo_summary('pop of 3', X_[2], num_runs*total_iterations + individual_iter_count)
                #logger.scalar_summary('sp of 3', -fitness_list[2][3], num_runs*total_iterations + individual_iter_count)
                #logger.scalar_summary('rank of 3', fitness_list[2][2], num_runs*total_iterations + individual_iter_count)

    ref_model.clear_cache()
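Each generation of prune_and_eval pairs one broadcast (candidate solutions out from rank 0) with one gather (fitness results back to rank 0). A stripped-down sketch of that communication skeleton; evolve_step, pop and evaluate are illustrative names, evaluate is assumed to return a float, and a backend with gather support (e.g. gloo) is assumed:

import torch
import torch.distributed as dist

def evolve_step(rank, size, pop, evaluate):
    # pop: (size, n) float tensor of candidate solutions, only meaningful on rank 0 before the call
    dist.broadcast(pop, src=0)                 # every rank receives the current population
    fit = torch.tensor([evaluate(pop[rank])])  # each rank evaluates its own candidate
    if rank == 0:
        gathered = [torch.zeros(1) for _ in range(size)]
        dist.gather(fit, gather_list=gathered, dst=0)
        return torch.cat(gathered)             # rank 0 now sees every fitness value
    dist.gather(fit, dst=0)
    return None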
Exemplo n.º 16
0
def broadcast_params(model):
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
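Because broadcast_params iterates over state_dict() values, it synchronizes buffers (e.g. BatchNorm running statistics) as well as weights. A hedged usage sketch; it assumes dist.init_process_group has already been called and the correct CUDA device has been selected on each rank:

import torch.nn as nn

# assumes dist.init_process_group(...) and torch.cuda.set_device(local_rank) were already done
model = nn.Sequential(nn.Linear(10, 10), nn.BatchNorm1d(10)).cuda()
broadcast_params(model)  # every rank now starts from rank 0's parameters and buffers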
Exemplo n.º 17
0
    print("Initialised process group")
    for at in range(20):
        if rank == 0:

            if args.test_correctness:
                torch.randn([
                    sz,
                ], out=tensor)
                rand = np.random.randint(5)
                ind = np.random.randint(100)
                tensor[ind] = rand

            st = time.time()

            dist.broadcast(tensor=tensor, src=0)

            if args.test_correctness:
                tensor.zero_()

            wait_st = time.time()
            dist.broadcast(tensor=tensor, src=1)
            wait_en = time.time()

            print("Time spent in receive call = ", wait_en - wait_st)

            if args.test_correctness:
                assert int(tensor[ind].item()) == rand
                print(f"Attempt {at}: Data was successfully received...")

            en = time.time()
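Only rank 0 of the ping-pong is shown above. A guessed sketch of what the opposite rank might run; this peer side is an assumption and does not appear in the original snippet:

import torch.distributed as dist

def pingpong_peer(tensor, attempts=20):
    # hypothetical receiver side mirroring the rank-0 loop above
    for _ in range(attempts):
        dist.broadcast(tensor=tensor, src=0)  # receive rank 0's data
        dist.broadcast(tensor=tensor, src=1)  # send it back so rank 0 can time the round trip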
Exemplo n.º 18
0
def main(local_rank, world_size, init_method='tcp://127.0.0.1:23499'):
    dist.init_process_group(backend='nccl',
                            init_method=init_method,
                            rank=local_rank,
                            world_size=world_size)
    cfg.local_rank = local_rank
    torch.cuda.set_device(local_rank)
    cfg.rank = dist.get_rank()
    cfg.world_size = world_size
    print(cfg.rank, dist.get_world_size())
    trainset = MXFaceDataset(root_dir='/root/face_datasets/webface/',
                             local_rank=local_rank)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainset, shuffle=True)
    trainloader = DataLoaderX(local_rank=local_rank,
                              dataset=trainset,
                              batch_size=cfg.batch_size,
                              sampler=train_sampler,
                              num_workers=0,
                              pin_memory=True,
                              drop_last=False)
    backbone = iresnet50(False).to(cfg.local_rank)
    backbone.train()
    # backbone = nn.SyncBatchNorm.convert_sync_batchnorm(backbone)
    for ps in backbone.parameters():
        dist.broadcast(ps, 0)

    backbone = torch.nn.parallel.DistributedDataParallel(
        backbone, broadcast_buffers=False, device_ids=[dist.get_rank()])
    backbone.train()
    sub_start, sub_classnum = get_sub_class(cfg.rank, dist.get_world_size())
    print(sub_start, sub_classnum)
    classifier_head = classifier(cfg.embedding_size,
                                 sub_classnum,
                                 sample_rate=0.4)
    cosface = CosFace(s=64.0, m=0.4)
    optimizer = SGD([{
        'params': backbone.parameters()
    }, {
        'params': classifier_head.parameters()
    }],
                    0.1,
                    momentum=0.9,
                    weight_decay=cfg.weight_decay,
                    rescale=cfg.world_size)
    warm_up_with_multistep_lr = lambda epoch: (
        (epoch + 1) / (4 + 1))**2 if epoch < -1 else 0.1**len(
            [m for m in [20, 29] if m - 1 <= epoch])
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=warm_up_with_multistep_lr)
    n_epochs = 33
    start_epoch = 0

    if cfg.local_rank == 0:
        writer = SummaryWriter(log_dir='logs/shows')
    global_step = 0
    loss_fun = nn.CrossEntropyLoss()
    for epoch in range(start_epoch, n_epochs):
        train_sampler.set_epoch(epoch)
        for step, (img, label) in enumerate(trainloader):
            start = time.time()
            lable_gather, norm_weight = classifier_head.prepare(
                label, optimizer)
            x = F.normalize(backbone(img))
            x_gather = torch.zeros(x.size()[0] * cfg.world_size,
                                   cfg.embedding_size,
                                   device=cfg.local_rank)
            dist.all_gather(list(x_gather.chunk(cfg.world_size, dim=0)),
                            x.data)
            x_gather.requires_grad = True

            logits = classifier_head(x_gather, norm_weight)

            logits = cosface(logits, lable_gather)

            with torch.no_grad():
                max_v = torch.max(logits, dim=1, keepdim=True)[0]
                dist.all_reduce(max_v, dist.ReduceOp.MAX)
                exp = torch.exp(logits - max_v)
                sum_exp = exp.sum(dim=1, keepdims=True)
                dist.all_reduce(sum_exp, dist.ReduceOp.SUM)
                exp.div_(sum_exp.clamp_min(1e-20))
                grad = exp
                index = torch.where(lable_gather != -1)[0]
                one_hot = torch.zeros(index.size()[0],
                                      grad.size()[1],
                                      device=grad.device)
                one_hot.scatter_(1, lable_gather[index, None], 1)

                loss = torch.zeros(grad.size()[0], 1, device=grad.device)
                loss[index] = grad[index].gather(1, lable_gather[index, None])
                dist.all_reduce(loss, dist.ReduceOp.SUM)
                loss_v = loss.clamp_min_(1e-20).log_().mean() * (-1)

                grad[index] -= one_hot
                grad.div_(grad.size()[0])

            logits.backward(grad)
            if x_gather.grad is not None:
                x_gather.grad.detach_()
            x_grad = torch.zeros_like(x)
            dist.reduce_scatter(
                x_grad, list(x_gather.grad.chunk(cfg.world_size, dim=0)))
            x.backward(x_grad)
            optimizer.step()
            classifier_head.update()
            optimizer.zero_grad()
            if cfg.rank == 0:
                print(x_gather.grad.max(), x_gather.grad.min())
                print('loss_v', loss_v.item(), global_step)
                writer.add_scalar('loss', loss_v, global_step)
                print('lr',
                      optimizer.state_dict()['param_groups'][0]['lr'],
                      global_step)
                print(cfg.batch_size / (time.time() - start))

            global_step += 1
        scheduler.step()
        if cfg.rank == 0:
            torch.save(backbone.module.state_dict(),
                       "models/" + str(epoch) + 'backbone.pth')
    dist.destroy_process_group()
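The no_grad block above implements a softmax whose class dimension is sharded across ranks: the max and the normalizer are made global with all_reduce before dividing. The same idea in isolation; sharded_softmax is an illustrative name, not part of the example:

import torch
import torch.distributed as dist

def sharded_softmax(logits):
    # logits: (batch, local_classes) shard of a logit matrix whose columns are split across ranks
    max_v = logits.max(dim=1, keepdim=True)[0]
    dist.all_reduce(max_v, dist.ReduceOp.MAX)  # global max for numerical stability
    exp = torch.exp(logits - max_v)
    denom = exp.sum(dim=1, keepdim=True)
    dist.all_reduce(denom, dist.ReduceOp.SUM)  # global normalizer
    return exp / denom.clamp_min(1e-20)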
Exemplo n.º 19
0
# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32

# Initialize MPI
rank, n_ranks = init_workers_nccl_file()
local_rank = rank % ranks_per_node

# Allocate a small tensor on every gpu from every rank.
# This is an attempt to force creation of all device contexts.
#for i in range(ranks_per_node):
#    _ = torch.randn(1).to(torch.device('cuda', i))

# Select our gpu
device = torch.device('cuda', local_rank)
print('Rank', rank, 'size', n_ranks, 'device', device, 'count', torch.cuda.device_count())

# Allocate a tensor on the gpu
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())

# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())

# Do an all-reduce
dist.all_reduce(x)
print('allreduce result:', x.sum())
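The script above only prints per-rank checksums after the broadcast. A small sanity check, not taken from the source, that every rank really ended up with identical data:

import torch
import torch.distributed as dist

def check_identical(x, n_ranks):
    # gather every rank's checksum on all ranks and compare them
    local = x.sum().reshape(1)
    sums = [torch.zeros_like(local) for _ in range(n_ranks)]
    dist.all_gather(sums, local)
    return all(torch.allclose(s, sums[0]) for s in sums)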
Exemplo n.º 20
0
def main():

    args.distributed = True

    print("~~epoch\thours\ttop1Accuracy\n")
    start_time = datetime.now()
    if args.distributed:
        os.environ['WORLD_SIZE'] = str(args.world_size)
        dist.init_process_group(backend=args.dist_backend, init_method = args.dist_url, world_size = args.world_size, rank = int(os.environ['RANK']))
        torch.cuda.set_device(args.local_rank)

        if dist.get_rank() == 0:
            print(str(dist.get_world_size()) + ' number of workers is set up!')

    if dist.get_rank() == 0:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    log_writer = tensorboardX.SummaryWriter(args.save_dir) if dist.get_rank() == 0 else None

    # create model
    model = models.resnet50()

    model = model.cuda()

    # model parameter sync
    global param_copy
    param_copy = list(model.parameters())
    for parameter in param_copy:
        dist.broadcast(parameter.data, 0) #group = 0
    if dist.get_rank() == 0:
        print('parameter sync finished')


    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = Signum_SGD.SGD_distribute(param_copy, args, log_writer)

    best_prec1 = 0

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else: print("=> no checkpoint found at '{}'".format(args.resume))


    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    args.sz = 224

    train_loader,val_loader,train_sampler = get_loaders(traindir, valdir, split_data = not args.test_evaluate, seed = args.seed)

    if args.evaluate: return validate(val_loader, model, criterion, epoch, start_time)

    for epoch in range(args.start_epoch, args.epochs):

        adjust_learning_rate(optimizer, epoch)

        if args.distributed:
            train_sampler.set_epoch(epoch)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            train(train_loader, model, criterion, optimizer, epoch, log_writer)

        if args.prof: break
        prec1 = validate(val_loader, model, criterion, epoch, start_time, log_writer)


        if dist.get_rank() == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            '''
Exemplo n.º 21
0
def train_net(args, config):
    # setup logger
    logger, final_output_path = create_logger(config.OUTPUT_PATH,
                                              args.cfg,
                                              config.DATASET.TRAIN_IMAGE_SET,
                                              split='train')
    model_prefix = os.path.join(final_output_path, config.MODEL_PREFIX)
    if args.log_dir is None:
        args.log_dir = os.path.join(final_output_path, 'tensorboard_logs')

    # pprint.pprint(args)
    # logger.info('training args:{}\n'.format(args))
    # pprint.pprint(config)
    # logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # manually set random seed
    if config.RNG_SEED > -1:
        random.seed(a=config.RNG_SEED)
        np.random.seed(config.RNG_SEED)
        torch.random.manual_seed(config.RNG_SEED)
        torch.cuda.manual_seed_all(config.RNG_SEED)
        torch.backends.cudnn.deterministic = True
        imgaug.random.seed(config.RNG_SEED)

    # cudnn
    torch.backends.cudnn.benchmark = False
    if args.cudnn_off:
        torch.backends.cudnn.enabled = False

    if args.dist:
        model = eval(config.MODULE)(config)
        local_rank = int(os.environ.get('LOCAL_RANK') or 0)
        config.GPUS = str(local_rank)
        torch.cuda.set_device(local_rank)
        master_address = os.environ['MASTER_ADDR']
        master_port = int(os.environ['MASTER_PORT'] or 23456)
        world_size = int(os.environ['WORLD_SIZE'] or 1)
        rank = int(os.environ['RANK'] or 0)

        if rank == 0:
            pprint.pprint(args)
            logger.info('training args:{}\n'.format(args))
            pprint.pprint(config)
            logger.info('training config:{}\n'.format(pprint.pformat(config)))

        if args.slurm:
            distributed.init_process_group(backend='nccl')
        else:
            try:
                distributed.init_process_group(
                    backend='nccl',
                    init_method='tcp://{}:{}'.format(master_address,
                                                     master_port),
                    world_size=world_size,
                    rank=rank,
                    group_name='mtorch')
            except RuntimeError:
                pass
        print(
            f'native distributed, size: {world_size}, rank: {rank}, local rank: {local_rank}'
        )
        torch.cuda.set_device(local_rank)
        config.GPUS = str(local_rank)
        model = model.cuda()
        if not config.TRAIN.FP16:
            model = DDP(model,
                        device_ids=[local_rank],
                        output_device=local_rank,
                        find_unused_parameters=True)

        if rank == 0:
            summary_parameters(
                model.module if isinstance(
                    model, torch.nn.parallel.DistributedDataParallel) else
                model, logger)
            shutil.copy(args.cfg, final_output_path)
            shutil.copy(inspect.getfile(eval(config.MODULE)),
                        final_output_path)

        writer = None
        if args.log_dir is not None:
            tb_log_dir = os.path.join(args.log_dir, 'rank{}'.format(rank))
            if not os.path.exists(tb_log_dir):
                os.makedirs(tb_log_dir)
            writer = SummaryWriter(log_dir=tb_log_dir)

        batch_size = world_size * (sum(config.TRAIN.BATCH_IMAGES) if
                                   isinstance(config.TRAIN.BATCH_IMAGES, list)
                                   else config.TRAIN.BATCH_IMAGES)
        if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1:
            batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS
        base_lr = config.TRAIN.LR * batch_size
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters() if _k in n],
            'lr':
            base_lr * _lr_mult
        } for _k, _lr_mult in config.TRAIN.LR_MULT]
        optimizer_grouped_parameters.append({
            'params': [
                p for n, p in model.named_parameters()
                if all([_k not in n for _k, _ in config.TRAIN.LR_MULT])
            ]
        })
        if config.TRAIN.OPTIMIZER == 'SGD':
            optimizer = optim.SGD(optimizer_grouped_parameters,
                                  lr=config.TRAIN.LR * batch_size,
                                  momentum=config.TRAIN.MOMENTUM,
                                  weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'Adam':
            optimizer = optim.Adam(optimizer_grouped_parameters,
                                   lr=config.TRAIN.LR * batch_size,
                                   weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=config.TRAIN.LR * batch_size,
                              betas=(0.9, 0.999),
                              eps=1e-6,
                              weight_decay=config.TRAIN.WD,
                              correct_bias=True)
        else:
            raise ValueError('Unsupported optimizer {}!'.format(config.TRAIN.OPTIMIZER))
        total_gpus = world_size

        train_loader, train_sampler = make_dataloader(config,
                                                      mode='train',
                                                      distributed=True,
                                                      num_replicas=world_size,
                                                      rank=rank,
                                                      expose_sampler=True)
        val_loader = make_dataloader(config,
                                     mode='val',
                                     distributed=True,
                                     num_replicas=world_size,
                                     rank=rank)

    else:
        pprint.pprint(args)
        logger.info('training args:{}\n'.format(args))
        pprint.pprint(config)
        logger.info('training config:{}\n'.format(pprint.pformat(config)))

        #os.environ['CUDA_VISIBLE_DEVICES'] = config.GPUS
        model = eval(config.MODULE)(config)
        summary_parameters(model, logger)
        shutil.copy(args.cfg, final_output_path)
        shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path)
        num_gpus = len(config.GPUS.split(','))
        # assert num_gpus <= 1 or (not config.TRAIN.FP16), "Not support fp16 with torch.nn.DataParallel. " \
        #                                                  "Please use amp.parallel.DistributedDataParallel instead."
        if num_gpus > 1 and config.TRAIN.FP16:
            logger.warning("Not support fp16 with torch.nn.DataParallel.")
            config.TRAIN.FP16 = False

        total_gpus = num_gpus
        rank = None
        writer = SummaryWriter(
            log_dir=args.log_dir) if args.log_dir is not None else None

        if hasattr(model, 'setup_adapter'):
            logger.info('Setting up adapter modules!')
            model.setup_adapter()

        # model
        if num_gpus > 1:
            model = torch.nn.DataParallel(
                model,
                device_ids=[int(d) for d in config.GPUS.split(',')]).cuda()
        else:
            torch.cuda.set_device(int(config.GPUS))
            model.cuda()

        # loader
        # train_set = 'train+val' if config.DATASET.TRAIN_WITH_VAL else 'train'
        train_loader = make_dataloader(config, mode='train', distributed=False)
        val_loader = make_dataloader(config, mode='val', distributed=False)
        train_sampler = None

        batch_size = num_gpus * (sum(config.TRAIN.BATCH_IMAGES) if isinstance(
            config.TRAIN.BATCH_IMAGES, list) else config.TRAIN.BATCH_IMAGES)
        if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1:
            batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS
        base_lr = config.TRAIN.LR * batch_size
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters() if _k in n],
            'lr':
            base_lr * _lr_mult
        } for _k, _lr_mult in config.TRAIN.LR_MULT]
        optimizer_grouped_parameters.append({
            'params': [
                p for n, p in model.named_parameters()
                if all([_k not in n for _k, _ in config.TRAIN.LR_MULT])
            ]
        })

        if config.TRAIN.OPTIMIZER == 'SGD':
            optimizer = optim.SGD(optimizer_grouped_parameters,
                                  lr=config.TRAIN.LR * batch_size,
                                  momentum=config.TRAIN.MOMENTUM,
                                  weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'Adam':
            optimizer = optim.Adam(optimizer_grouped_parameters,
                                   lr=config.TRAIN.LR * batch_size,
                                   weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=config.TRAIN.LR * batch_size,
                              betas=(0.9, 0.999),
                              eps=1e-6,
                              weight_decay=config.TRAIN.WD,
                              correct_bias=True)
        else:
            raise ValueError('Unsupported optimizer {}!'.format(config.TRAIN.OPTIMIZER))

    # partial load pretrain state dict
    if config.NETWORK.PARTIAL_PRETRAIN != "":
        pretrain_state_dict = torch.load(
            config.NETWORK.PARTIAL_PRETRAIN,
            map_location=lambda storage, loc: storage)['state_dict']
        prefix_change = [
            prefix_change.split('->')
            for prefix_change in config.NETWORK.PARTIAL_PRETRAIN_PREFIX_CHANGES
        ]
        if len(prefix_change) > 0:
            pretrain_state_dict_parsed = {}
            for k, v in pretrain_state_dict.items():
                no_match = True
                for pretrain_prefix, new_prefix in prefix_change:
                    if k.startswith(pretrain_prefix):
                        k = new_prefix + k[len(pretrain_prefix):]
                        pretrain_state_dict_parsed[k] = v
                        no_match = False
                        break
                if no_match:
                    pretrain_state_dict_parsed[k] = v
            pretrain_state_dict = pretrain_state_dict_parsed
        smart_partial_load_model_state_dict(model, pretrain_state_dict)

    # pretrained classifier
    # if config.NETWORK.CLASSIFIER_PRETRAINED:
    #     print('Initializing classifier weight from pretrained word embeddings...')
    #     answers_word_embed = []
    #     for k, v in model.state_dict().items():
    #         if 'word_embeddings.weight' in k:
    #             word_embeddings = v.detach().clone()
    #             break
    #     for answer in train_loader.dataset.answer_vocab:
    #         a_tokens = train_loader.dataset.tokenizer.tokenize(answer)
    #         a_ids = train_loader.dataset.tokenizer.convert_tokens_to_ids(a_tokens)
    #         a_word_embed = (torch.stack([word_embeddings[a_id] for a_id in a_ids], dim=0)).mean(dim=0)
    #         answers_word_embed.append(a_word_embed)
    #     answers_word_embed_tensor = torch.stack(answers_word_embed, dim=0)
    #     for name, module in model.named_modules():
    #         if name.endswith('final_mlp'):
    #             module[-1].weight.data = answers_word_embed_tensor.to(device=module[-1].weight.data.device)

    # metrics
    train_metrics_list = [
        cls_metrics.Accuracy(allreduce=args.dist,
                             num_replicas=world_size if args.dist else 1)
    ]
    val_metrics_list = [
        cls_metrics.Accuracy(allreduce=args.dist,
                             num_replicas=world_size if args.dist else 1),
        cls_metrics.RocAUC(allreduce=args.dist,
                           num_replicas=world_size if args.dist else 1)
    ]
    for output_name, display_name in config.TRAIN.LOSS_LOGGERS:
        train_metrics_list.append(
            cls_metrics.LossLogger(
                output_name,
                display_name=display_name,
                allreduce=args.dist,
                num_replicas=world_size if args.dist else 1))

    train_metrics = CompositeEvalMetric()
    val_metrics = CompositeEvalMetric()
    for child_metric in train_metrics_list:
        train_metrics.add(child_metric)
    for child_metric in val_metrics_list:
        val_metrics.add(child_metric)

    # epoch end callbacks
    epoch_end_callbacks = []
    if (rank is None) or (rank == 0):
        epoch_end_callbacks = [
            Checkpoint(model_prefix, config.CHECKPOINT_FREQUENT)
        ]
    validation_monitor = ValidationMonitor(
        do_validation,
        val_loader,
        val_metrics,
        host_metric_name='RocAUC',
        label_index_in_batch=config.DATASET.LABEL_INDEX_IN_BATCH,
        model_dir=os.path.dirname(model_prefix))

    # optimizer initial lr before
    for group in optimizer.param_groups:
        group.setdefault('initial_lr', group['lr'])

    # resume/auto-resume
    if rank is None or rank == 0:
        smart_resume(model, optimizer, validation_monitor, config,
                     model_prefix, logger)
    if args.dist:
        begin_epoch = torch.tensor(config.TRAIN.BEGIN_EPOCH).cuda()
        distributed.broadcast(begin_epoch, src=0)
        config.TRAIN.BEGIN_EPOCH = begin_epoch.item()

    # batch end callbacks
    batch_size = len(config.GPUS.split(',')) * config.TRAIN.BATCH_IMAGES
    batch_end_callbacks = [
        Speedometer(batch_size,
                    config.LOG_FREQUENT,
                    batches_per_epoch=len(train_loader),
                    epochs=config.TRAIN.END_EPOCH - config.TRAIN.BEGIN_EPOCH)
    ]

    # setup lr step and lr scheduler
    if config.TRAIN.LR_SCHEDULE == 'plateau':
        print("Warning: not support resuming on plateau lr schedule!")
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=config.TRAIN.LR_FACTOR,
            patience=1,
            verbose=True,
            threshold=1e-4,
            threshold_mode='rel',
            cooldown=2,
            min_lr=0,
            eps=1e-8)
    elif config.TRAIN.LR_SCHEDULE == 'triangle':
        lr_scheduler = WarmupLinearSchedule(
            optimizer,
            config.TRAIN.WARMUP_STEPS if config.TRAIN.WARMUP else 0,
            t_total=int(config.TRAIN.END_EPOCH * len(train_loader) /
                        config.TRAIN.GRAD_ACCUMULATE_STEPS),
            last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) /
                           config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1)
    elif config.TRAIN.LR_SCHEDULE == 'step':
        lr_iters = [
            int(epoch * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS)
            for epoch in config.TRAIN.LR_STEP
        ]
        lr_scheduler = WarmupMultiStepLR(
            optimizer,
            milestones=lr_iters,
            gamma=config.TRAIN.LR_FACTOR,
            warmup_factor=config.TRAIN.WARMUP_FACTOR,
            warmup_iters=config.TRAIN.WARMUP_STEPS
            if config.TRAIN.WARMUP else 0,
            warmup_method=config.TRAIN.WARMUP_METHOD,
            last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) /
                           config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1)
    else:
        raise ValueError("Not support lr schedule: {}.".format(
            config.TRAIN.LR_SCHEDULE))

    if config.TRAIN.SWA:
        assert config.TRAIN.SWA_START_EPOCH < config.TRAIN.END_EPOCH
        if not config.TRAIN.DEBUG:
            true_epoch_step = len(
                train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS
        else:
            true_epoch_step = 50
        step_per_cycle = config.TRAIN.SWA_EPOCH_PER_CYCLE * true_epoch_step

        # swa_scheduler = torch.optim.lr_scheduler.CyclicLR(
        #     optimizer,
        #     base_lr=config.TRAIN.SWA_MIN_LR * batch_size,
        #     max_lr=config.TRAIN.SWA_MAX_LR * batch_size,
        #     cycle_momentum=False,
        #     step_size_up=10,
        #     step_size_down=step_per_cycle - 10)

        anneal_steps = max(
            1, (config.TRAIN.END_EPOCH - config.TRAIN.SWA_START_EPOCH) //
            4) * step_per_cycle
        anneal_steps = int(anneal_steps)
        swa_scheduler = SWALR(optimizer,
                              anneal_epochs=anneal_steps,
                              anneal_strategy='linear',
                              swa_lr=config.TRAIN.SWA_MAX_LR * batch_size)
    else:
        swa_scheduler = None

    if config.TRAIN.ROC_STAR:
        assert config.TRAIN.ROC_START_EPOCH < config.TRAIN.END_EPOCH
        roc_star = RocStarLoss(
            delta=2.0,
            sample_size=config.TRAIN.ROC_SAMPLE_SIZE,
            sample_size_gamma=config.TRAIN.ROC_SAMPLE_SIZE * 2,
            update_gamma_each=config.TRAIN.ROC_SAMPLE_SIZE,
        )
    else:
        roc_star = None

    # broadcast parameter and optimizer state from rank 0 before training start
    if args.dist:
        for v in model.state_dict().values():
            distributed.broadcast(v, src=0)
        # for v in optimizer.state_dict().values():
        #     distributed.broadcast(v, src=0)
        best_epoch = torch.tensor(validation_monitor.best_epoch).cuda()
        best_val = torch.tensor(validation_monitor.best_val).cuda()
        distributed.broadcast(best_epoch, src=0)
        distributed.broadcast(best_val, src=0)
        validation_monitor.best_epoch = best_epoch.item()
        validation_monitor.best_val = best_val.item()

    # apex: amp fp16 mixed-precision training
    if config.TRAIN.FP16:
        # model.apply(bn_fp16_half_eval)
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level='O2',
            keep_batchnorm_fp32=False,
            loss_scale=config.TRAIN.FP16_LOSS_SCALE,
            min_loss_scale=32.0)
        if args.dist:
            model = Apex_DDP(model, delay_allreduce=True)

    # NOTE: final_model == model if not using SWA, else final_model == AveragedModel(model)
    final_model = train(
        model,
        optimizer,
        lr_scheduler,
        train_loader,
        train_sampler,
        train_metrics,
        config.TRAIN.BEGIN_EPOCH,
        config.TRAIN.END_EPOCH,
        logger,
        fp16=config.TRAIN.FP16,
        rank=rank,
        writer=writer,
        batch_end_callbacks=batch_end_callbacks,
        epoch_end_callbacks=epoch_end_callbacks,
        validation_monitor=validation_monitor,
        clip_grad_norm=config.TRAIN.CLIP_GRAD_NORM,
        gradient_accumulate_steps=config.TRAIN.GRAD_ACCUMULATE_STEPS,
        ckpt_path=config.TRAIN.CKPT_PATH,
        swa_scheduler=swa_scheduler,
        swa_start_epoch=config.TRAIN.SWA_START_EPOCH,
        swa_cycle_epoch=config.TRAIN.SWA_EPOCH_PER_CYCLE,
        swa_use_scheduler=config.TRAIN.SWA_SCHEDULE,
        roc_star=roc_star,
        roc_star_start_epoch=config.TRAIN.ROC_START_EPOCH,
        roc_interleave=config.TRAIN.ROC_INTERLEAVE,
        debug=config.TRAIN.DEBUG,
    )

    return rank, final_model
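The block above synchronizes resumed training state by broadcasting every tensor in model.state_dict() plus the best_epoch/best_val scalars from rank 0 (the optimizer-state broadcast is left commented out). A minimal sketch of that pattern, assuming an already initialized CUDA-capable process group; the helper name and arguments are illustrative, not part of the example:

import torch
import torch.distributed as dist

def sync_resume_state(model, begin_epoch, best_val, device='cuda'):
    # Parameters and buffers: broadcast every tensor in-place from rank 0.
    for tensor in model.state_dict().values():
        dist.broadcast(tensor, src=0)
    # Scalars: pack them into a tensor so they can travel over the same backend.
    scalars = torch.tensor([float(begin_epoch), float(best_val)], device=device)
    dist.broadcast(scalars, src=0)
    return int(scalars[0].item()), scalars[1].item()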
Exemplo n.º 22
0
    def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
        dist.broadcast(tensor, src=src)
        return tensor

    def sync_parameters(self):
        for param in self.module.parameters():
            dist.broadcast(param.data, 0)
Exemplo n.º 24
0
    def weight_broadcast(self):
        for param in self.module.parameters():
            dist.broadcast(param.data, 0)
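weight_broadcast above only synchronizes learnable parameters. A hedged sketch of a variant that also covers buffers such as BatchNorm running statistics (the helper name is an assumption, not part of the example):

import torch.distributed as dist
import torch.nn as nn

def broadcast_params_and_buffers(module: nn.Module, src: int = 0) -> None:
    # Broadcast learnable parameters from `src` to every other rank.
    for param in module.parameters():
        dist.broadcast(param.data, src)
    # Buffers (e.g. BatchNorm running mean/var) drift too if left unsynchronized.
    for buf in module.buffers():
        dist.broadcast(buf, src)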
Exemplo n.º 25
0
def train(hyp, opt, device, tb_writer=None):
    print(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = str(log_dir / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # TODO: Use DDP logging. Only the first process is allowed to log.
    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Darknet(opt.cfg).to(device)  # create
        state_dict = {k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel()}
        model.load_state_dict(state_dict, strict=False)
        print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Darknet(opt.cfg).to(device) # create

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2.append(v)  # biases
        elif 'Conv2d.weight' in k:
            pg1.append(v)  # apply weight_decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                  (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict
    
    # Image sizes
    gs = 32 # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank))

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images, rect=opt.rect, local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates ***
        # local_rank is set to -1. Because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt, hyp=hyp, augment=False,
                                       cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        #if not opt.noautoanchor:
        #    check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
                dataset.indices = random.choices(range(dataset.n), weights=image_weights,
                                                 k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Autocast
            with amp.autocast(enabled=cuda):
                # Forward
                pred = model(imgs)

                # Loss
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode
                # if not torch.isfinite(loss):
                #     print('WARNING: non-finite loss, ending training ', loss_items)
                #     return results

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(model)
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=batch_size,
                                                 imgsz=imgsz_test,
                                                 save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
                                                 model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema.module.state_dict() if hasattr(ema.ema, 'module') else ema.ema.state_dict(),
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if epoch >= (epochs-5):
                    torch.save(ckpt, last.replace('.pt','_{:03d}.pt'.format(epoch)))
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
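The inner loop above combines torch.cuda.amp with gradient accumulation: the scaled loss is backpropagated on every batch, but the optimizer only steps every `accumulate` batches. A stripped-down sketch of just that pattern; the model/loader names are placeholders and the model is assumed to return a scalar loss:

import torch
from torch.cuda import amp

def train_amp_accumulate(model, loader, optimizer, accumulate=4, device='cuda'):
    scaler = amp.GradScaler(enabled=True)
    optimizer.zero_grad()
    for i, (imgs, targets) in enumerate(loader):
        imgs, targets = imgs.to(device), targets.to(device)
        with amp.autocast(enabled=True):
            loss = model(imgs, targets)      # assumed to return a scalar loss
        scaler.scale(loss).backward()        # accumulate scaled gradients
        if (i + 1) % accumulate == 0:        # step only every `accumulate` batches
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()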
Exemplo n.º 26
0
def broadcast(tensor, src):
    dist.broadcast(tensor, src=src)
    return tensor
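Note that dist.broadcast returns None unless async_op=True; with async_op=True it returns a work handle that can be waited on later, which lets communication overlap with computation. A small hedged sketch of the asynchronous form, assuming a process group is already initialized:

import torch.distributed as dist

def broadcast_async(tensor, src):
    work = dist.broadcast(tensor, src=src, async_op=True)
    # ... independent computation can run here while the broadcast is in flight ...
    work.wait()          # block until `tensor` holds the data from `src`
    return tensor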
Exemplo n.º 27
0
def run(rank, size):
    local_train_length = 3000
    local_test_length = 333
    train_indices = torch.zeros([3, local_train_length], dtype=torch.long)
    test_indices = torch.zeros([3, local_test_length], dtype=torch.long)

    local_data_path = '/home/cream/Desktop/arafin_experiments/SOCC/FL-SNN/data/'
    save_path = os.getcwd() + r'/results'

    datasets = {'mnist_dvs_10': r'mnist_dvs_25ms_26pxl_10_digits.hdf5'}
    dataset = local_data_path + datasets['mnist_dvs_10']

    input_train = torch.FloatTensor(
        tables.open_file(dataset).root.train.data[:])
    output_train = torch.FloatTensor(
        tables.open_file(dataset).root.train.label[:])

    input_test = torch.FloatTensor(tables.open_file(dataset).root.test.data[:])
    output_test = torch.FloatTensor(
        tables.open_file(dataset).root.test.label[:])
    ### Network parameters
    n_input_neurons = input_train.shape[1]
    n_output_neurons = output_train.shape[1]
    n_hidden_neurons = 4
    epochs = local_train_length
    epochs_test = local_test_length

    learning_rate = 0.005 / n_hidden_neurons
    kappa = 0.2
    alpha = 1
    deltas = 1
    num_ite = 1
    r = 0.3
    weights_magnitude = 0.05
    task = 'supervised'
    mode = 'train'
    tau_ff = 10
    tau_fb = 10
    tau = 10
    mu = 1.5
    n_basis_feedforward = 8
    feedforward_filter = filters.raised_cosine_pillow_08
    feedback_filter = filters.raised_cosine_pillow_08
    n_basis_feedback = 1
    topology = torch.ones([
        n_hidden_neurons + n_output_neurons,
        n_input_neurons + n_hidden_neurons + n_output_neurons
    ],
                          dtype=torch.float)
    topology[[i for i in range(n_output_neurons + n_hidden_neurons)], [
        i + n_input_neurons for i in range(n_output_neurons + n_hidden_neurons)
    ]] = 0
    assert torch.sum(topology[:, :n_input_neurons]) == (
        n_input_neurons * (n_hidden_neurons + n_output_neurons))
    print(topology[:, n_input_neurons:])
    # Create the network
    network = SNNetwork(**utils.training_utils.make_network_parameters(
        n_input_neurons,
        n_output_neurons,
        n_hidden_neurons,
        topology_type='fully_connected'))

    # At the beginning, the master node:
    # - transmits its weights to the workers
    # - distributes the samples among workers
    if rank == 0:
        # Initializing an aggregation list for future weights collection
        weights_list = [
            [
                torch.zeros(network.feedforward_weights.shape,
                            dtype=torch.float) for _ in range(size)
            ],
            [
                torch.zeros(network.feedback_weights.shape, dtype=torch.float)
                for _ in range(size)
            ],
            [
                torch.zeros(network.bias.shape, dtype=torch.float)
                for _ in range(size)
            ], [torch.zeros(1, dtype=torch.float) for _ in range(size)]
        ]
    else:
        weights_list = []

    if rank == 0:
        train_indicess = torch.tensor(np.random.choice(np.arange(
            input_train.shape[0]), [3, local_train_length],
                                                       replace=False),
                                      dtype=torch.long)
        test_indicess = torch.tensor(np.random.choice(np.arange(
            input_test.shape[0]), [3, local_test_length],
                                                      replace=False),
                                     dtype=torch.long)
        dist.send(tensor=train_indicess, dst=1)
        dist.send(tensor=train_indicess, dst=2)
        dist.send(tensor=train_indicess, dst=3)
    else:
        dist.recv(tensor=train_indices, src=0)
    dist.barrier()

    if rank == 0:
        dist.send(tensor=test_indicess, dst=1)
        dist.send(tensor=test_indicess, dst=2)
        dist.send(tensor=test_indicess, dst=3)
    else:
        dist.recv(tensor=test_indices, src=0)
    dist.barrier()
    if rank != 0:
        training_data = input_train[train_indices[rank - 1, :]]
        training_label = output_train[train_indices[rank - 1, :]]
        test_data = input_test[test_indices[rank - 1, :]]
        test_label = output_test[test_indices[rank - 1, :]]

        indices = np.random.choice(np.arange(training_data.shape[0]),
                                   [training_data.shape[0]],
                                   replace=True)
        S_prime = training_data.shape[-1]
        S = epochs * S_prime
        print("S is", S)
    dist.barrier()

    group = dist.group.WORLD
    # Master node sends its weights
    for parameter in network.get_parameters():
        dist.broadcast(network.get_parameters()[parameter], 0)
    if rank == 0:
        print(
            'Node 0 has shared its model and training data is partitioned among workers'
        )
    # The nodes initialize their eligibility trace and learning signal
    eligibility_trace = {'ff_weights': 0, 'fb_weights': 0, 'bias': 0}
    et_temp = {'ff_weights': 0, 'fb_weights': 0, 'bias': 0}

    learning_signal = 0
    ls_temp = 0
    dist.barrier()
    num_ite = 1

    test_accs = []
    if rank != 0:
        test_indx = np.random.choice(np.arange(test_data.shape[0]),
                                     [test_data.shape[0]],
                                     replace=False)
        np.random.shuffle(test_indx)

        _, loss = get_acc_and_loss(network, test_data[test_indx],
                                   test_label[test_indx])

        network.set_mode('train')
        local_training_sequence = torch.cat((training_data, training_label),
                                            dim=1)
    dist.barrier()
    ### First local step
    for i in range(num_ite):
        for s in range(deltas):
            if rank != 0:
                # Feedforward sampling step
                log_proba, learning_signal, eligibility_trace \
                    = feedforward_sampling(network, local_training_sequence[indices[0]], eligibility_trace, learning_signal, s, S_prime, alpha, r)

        if rank != 0:
            # First local update
            for parameter in eligibility_trace:
                eligibility_trace[parameter][
                    network.hidden_neurons -
                    network.n_non_learnable_neurons] *= learning_signal
                network.get_parameters(
                )[parameter] += eligibility_trace[parameter] * learning_rate

        # First global update
        if (s + 1) % (tau * deltas) == 0:
            dist.barrier()
            global_update(group, rank, network, weights_list)
            dist.barrier()

        S = input_train.shape[-1] * local_train_length
        ### Remainder of the steps
        for s in range(deltas, S):
            print(s)
            if rank != 0:
                if s % S_prime == 0:  # Reset internal state for each example
                    network.reset_internal_state()

                # lr decay
                if (s % S / 5 == 0) & (learning_rate > 0.005):
                    learning_rate /= 2

                # Feedforward sampling
                log_proba, ls_temp, et_temp \
                    = feedforward_sampling(network, local_training_sequence[indices[0]], et_temp, ls_temp, s, S_prime, alpha, r)

                # Local feedback and global update
                learning_signal, ls_temp, eligibility_trace, et_temp \
                    = local_feedback_and_update(network, eligibility_trace, learning_signal, et_temp, ls_temp, learning_rate, kappa, s, deltas)

                ## Every few timesteps, record test losses
                if (s + 1) % 40 == 0:
                    _, loss = get_acc_and_loss(network, test_data[test_indx],
                                               test_label[test_indx])

                    network.set_mode('train')

            # Global update
            if (s + 1) % (tau * deltas) == 0:
                dist.barrier()
                global_update(group, rank, network, weights_list)
                dist.barrier()

        if rank == 0:
            global_test_indices = np.random.choice(np.arange(
                input_test.shape[0]), [epochs_test],
                                                   replace=False)
            np.random.shuffle(global_test_indices)
            print(global_test_indices)
            global_acc, _ = get_acc_and_loss(network,
                                             input_test[global_test_indices],
                                             output_test[global_test_indices])
            print('Final global test accuracy: %f' % global_acc)
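In the example above, rank 0 distributes identical index tensors with one dist.send per worker, mirrored by dist.recv on the receiving side. Since every worker receives the same data, a single collective broadcast would also do the job; a minimal sketch under that assumption (helper name and shapes are illustrative):

import numpy as np
import torch
import torch.distributed as dist

def share_indices(rank, n_workers, pool_size, n_samples):
    # Assumes pool_size >= n_workers * n_samples so sampling without replacement works.
    if rank == 0:
        # Master samples the indices once for all workers.
        indices = torch.tensor(
            np.random.choice(pool_size, [n_workers, n_samples], replace=False),
            dtype=torch.long)
    else:
        # Workers allocate a tensor of matching shape/dtype to receive into.
        indices = torch.zeros([n_workers, n_samples], dtype=torch.long)
    dist.broadcast(indices, src=0)
    return indices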
Exemplo n.º 28
0
def train_net(args, config):
    # setup logger
    logger, final_output_path = create_logger(config.OUTPUT_PATH,
                                              args.cfg,
                                              config.DATASET.IMAGE_SET,
                                              split='train')
    model_prefix = os.path.join(final_output_path, config.MODEL_PREFIX)
    if args.log_dir is None:
        args.log_dir = os.path.join(final_output_path, 'tensorboard_logs')

    pprint.pprint(args)
    logger.info('training args:{}\n'.format(args))
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # manually set random seed
    if config.RNG_SEED > -1:
        np.random.seed(config.RNG_SEED)
        torch.random.manual_seed(config.RNG_SEED)
        torch.cuda.manual_seed_all(config.RNG_SEED)

    # cudnn
    torch.backends.cudnn.benchmark = False
    if args.cudnn_off:
        torch.backends.cudnn.enabled = False

    if args.dist:
        model = eval(config.MODULE)(config)
        local_rank = int(os.environ.get('LOCAL_RANK') or 0)
        os.environ['CUDA_VISIBLE_DEVICES'] = config.GPUS
        torch.cuda.set_device(local_rank)
        master_address = os.environ['MASTER_ADDR']
        master_port = int(os.environ.get('MASTER_PORT') or 23456)
        world_size = int(os.environ.get('WORLD_SIZE') or 1)
        rank = int(os.environ.get('RANK') or 0)
        if args.slurm:
            distributed.init_process_group(backend='nccl')
        else:
            distributed.init_process_group(backend='nccl',
                                           init_method='tcp://{}:{}'.format(
                                               master_address, master_port),
                                           world_size=world_size,
                                           rank=rank,
                                           group_name='mtorch')
        print(
            f'native distributed, size: {world_size}, rank: {rank}, local rank: {local_rank}'
        )
        torch.cuda.set_device(local_rank)
        config.GPUS = str(local_rank)
        model = model.cuda()
        if not config.TRAIN.FP16:
            model = DDP(model,
                        device_ids=[local_rank],
                        output_device=local_rank)

        if rank == 0:
            summary_parameters(
                model.module if isinstance(
                    model, torch.nn.parallel.DistributedDataParallel) else
                model, logger)
            shutil.copy(args.cfg, final_output_path)
            shutil.copy(inspect.getfile(eval(config.MODULE)),
                        final_output_path)

        writer = None
        if args.log_dir is not None:
            tb_log_dir = os.path.join(args.log_dir, 'rank{}'.format(rank))
            if not os.path.exists(tb_log_dir):
                os.makedirs(tb_log_dir)
            writer = SummaryWriter(log_dir=tb_log_dir)

        train_loader, train_sampler = make_dataloader(config,
                                                      mode='train',
                                                      distributed=True,
                                                      num_replicas=world_size,
                                                      rank=rank,
                                                      expose_sampler=True)
        val_loader = make_dataloader(config,
                                     mode='val',
                                     distributed=True,
                                     num_replicas=world_size,
                                     rank=rank)

        batch_size = world_size * (sum(config.TRAIN.BATCH_IMAGES) if
                                   isinstance(config.TRAIN.BATCH_IMAGES, list)
                                   else config.TRAIN.BATCH_IMAGES)
        if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1:
            batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS
        base_lr = config.TRAIN.LR * batch_size
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters() if _k in n],
            'lr':
            base_lr * _lr_mult
        } for _k, _lr_mult in config.TRAIN.LR_MULT]
        optimizer_grouped_parameters.append({
            'params': [
                p for n, p in model.named_parameters()
                if all([_k not in n for _k, _ in config.TRAIN.LR_MULT])
            ]
        })
        if config.TRAIN.OPTIMIZER == 'SGD':
            optimizer = optim.SGD(optimizer_grouped_parameters,
                                  lr=config.TRAIN.LR * batch_size,
                                  momentum=config.TRAIN.MOMENTUM,
                                  weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'Adam':
            optimizer = optim.Adam(optimizer_grouped_parameters,
                                   lr=config.TRAIN.LR * batch_size,
                                   weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=config.TRAIN.LR * batch_size,
                              betas=(0.9, 0.999),
                              eps=1e-6,
                              weight_decay=config.TRAIN.WD,
                              correct_bias=True)
        else:
            raise ValueError('Unsupported optimizer: {}!'.format(
                config.TRAIN.OPTIMIZER))
        total_gpus = world_size

    else:
        #os.environ['CUDA_VISIBLE_DEVICES'] = config.GPUS
        model = eval(config.MODULE)(config)
        summary_parameters(model, logger)
        shutil.copy(args.cfg, final_output_path)
        shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path)
        num_gpus = len(config.GPUS.split(','))
        assert num_gpus <= 1 or (not config.TRAIN.FP16), "Not support fp16 with torch.nn.DataParallel. " \
                                                         "Please use amp.parallel.DistributedDataParallel instead."
        total_gpus = num_gpus
        rank = None
        writer = SummaryWriter(
            log_dir=args.log_dir) if args.log_dir is not None else None

        # model
        if num_gpus > 1:
            model = torch.nn.DataParallel(
                model,
                device_ids=[int(d) for d in config.GPUS.split(',')]).cuda()
        else:
            torch.cuda.set_device(int(config.GPUS))
            model.cuda()

        # loader
        train_loader = make_dataloader(config, mode='train', distributed=False)
        val_loader = make_dataloader(config, mode='val', distributed=False)
        train_sampler = None

        batch_size = num_gpus * (sum(config.TRAIN.BATCH_IMAGES) if isinstance(
            config.TRAIN.BATCH_IMAGES, list) else config.TRAIN.BATCH_IMAGES)
        if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1:
            batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS
        base_lr = config.TRAIN.LR * batch_size
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters() if _k in n],
            'lr':
            base_lr * _lr_mult
        } for _k, _lr_mult in config.TRAIN.LR_MULT]
        optimizer_grouped_parameters.append({
            'params': [
                p for n, p in model.named_parameters()
                if all([_k not in n for _k, _ in config.TRAIN.LR_MULT])
            ]
        })

        if config.TRAIN.OPTIMIZER == 'SGD':
            optimizer = optim.SGD(optimizer_grouped_parameters,
                                  lr=config.TRAIN.LR * batch_size,
                                  momentum=config.TRAIN.MOMENTUM,
                                  weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'Adam':
            optimizer = optim.Adam(optimizer_grouped_parameters,
                                   lr=config.TRAIN.LR * batch_size,
                                   weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=config.TRAIN.LR * batch_size,
                              betas=(0.9, 0.999),
                              eps=1e-6,
                              weight_decay=config.TRAIN.WD,
                              correct_bias=True)
        else:
            raise ValueError('Unsupported optimizer: {}!'.format(
                config.TRAIN.OPTIMIZER))

    # partial load pretrain state dict
    if config.NETWORK.PARTIAL_PRETRAIN != "":
        pretrain_state_dict = torch.load(
            config.NETWORK.PARTIAL_PRETRAIN,
            map_location=lambda storage, loc: storage)['state_dict']
        prefix_change = [
            prefix_change.split('->')
            for prefix_change in config.NETWORK.PARTIAL_PRETRAIN_PREFIX_CHANGES
        ]

        pretrain_state_dict_parsed = {}
        for k, v in pretrain_state_dict.items():
            no_match = True
            for pretrain_prefix, new_prefix in prefix_change:
                if k.startswith(pretrain_prefix):
                    k = new_prefix + k[len(pretrain_prefix):]
                    pretrain_state_dict_parsed[k] = v
                    no_match = False
                    break
            if no_match:
                pretrain_state_dict_parsed[k] = v
        if 'module.vlbert.relationsip_head.caption_image_relationship.weight' in pretrain_state_dict \
                and config.NETWORK.LOAD_REL_HEAD:
            pretrain_state_dict_parsed['module.final_mlp.1.weight'] \
                = pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.weight'][1:2].float() \
                - pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.weight'][0:1].float()
            pretrain_state_dict_parsed['module.final_mlp.1.bias'] \
                = pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.bias'][1:2].float() \
                  - pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.bias'][0:1].float()
        if config.NETWORK.PARTIAL_PRETRAIN_SEGMB_INIT:
            if isinstance(
                    pretrain_state_dict_parsed[
                        'module.vlbert._module.token_type_embeddings.weight'],
                    torch.HalfTensor):
                pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'] = \
                    pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'].float()
            pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'][1] = \
                pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'][0]
        pretrain_state_dict = pretrain_state_dict_parsed

        smart_partial_load_model_state_dict(model, pretrain_state_dict)

    # metrics
    train_metrics_list = [
        snlive_metrics.Accuracy(allreduce=args.dist,
                                num_replicas=world_size if args.dist else 1)
    ]
    val_metrics_list = [
        snlive_metrics.Accuracy(allreduce=args.dist,
                                num_replicas=world_size if args.dist else 1)
    ]

    for output_name, display_name in config.TRAIN.LOSS_LOGGERS:
        train_metrics_list.append(
            snlive_metrics.LossLogger(
                output_name,
                display_name=display_name,
                allreduce=args.dist,
                num_replicas=world_size if args.dist else 1))

    train_metrics = CompositeEvalMetric()
    val_metrics = CompositeEvalMetric()
    for child_metric in train_metrics_list:
        train_metrics.add(child_metric)
    for child_metric in val_metrics_list:
        val_metrics.add(child_metric)

    # epoch end callbacks
    epoch_end_callbacks = []
    if (rank is None) or (rank == 0):
        epoch_end_callbacks = [
            Checkpoint(model_prefix, config.CHECKPOINT_FREQUENT)
        ]
    validation_monitor = ValidationMonitor(
        do_validation,
        val_loader,
        val_metrics,
        host_metric_name='Acc',
        label_index_in_batch=config.DATASET.LABEL_INDEX_IN_BATCH)

    # optimizer initial lr before
    for group in optimizer.param_groups:
        group.setdefault('initial_lr', group['lr'])

    # resume/auto-resume
    if rank is None or rank == 0:
        smart_resume(model, optimizer, validation_monitor, config,
                     model_prefix, logger)
    if args.dist:
        begin_epoch = torch.tensor(config.TRAIN.BEGIN_EPOCH).cuda()
        distributed.broadcast(begin_epoch, src=0)
        config.TRAIN.BEGIN_EPOCH = begin_epoch.item()

    # batch end callbacks
    batch_size = len(config.GPUS.split(',')) * config.TRAIN.BATCH_IMAGES
    batch_end_callbacks = [
        Speedometer(batch_size,
                    config.LOG_FREQUENT,
                    batches_per_epoch=len(train_loader),
                    epochs=config.TRAIN.END_EPOCH - config.TRAIN.BEGIN_EPOCH)
    ]

    # setup lr step and lr scheduler

    if config.TRAIN.LR_SCHEDULE == 'plateau':
        print("Warning: not support resuming on plateau lr schedule!")
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=config.TRAIN.LR_FACTOR,
            patience=1,
            verbose=True,
            threshold=1e-4,
            threshold_mode='rel',
            cooldown=2,
            min_lr=0,
            eps=1e-8)
    elif config.TRAIN.LR_SCHEDULE == 'triangle':
        lr_scheduler = WarmupLinearSchedule(
            optimizer,
            config.TRAIN.WARMUP_STEPS if config.TRAIN.WARMUP else 0,
            t_total=int(config.TRAIN.END_EPOCH * len(train_loader) /
                        config.TRAIN.GRAD_ACCUMULATE_STEPS),
            last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) /
                           config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1)
    elif config.TRAIN.LR_SCHEDULE == 'step':
        lr_iters = [
            int(epoch * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS)
            for epoch in config.TRAIN.LR_STEP
        ]
        lr_scheduler = WarmupMultiStepLR(
            optimizer,
            milestones=lr_iters,
            gamma=config.TRAIN.LR_FACTOR,
            warmup_factor=config.TRAIN.WARMUP_FACTOR,
            warmup_iters=config.TRAIN.WARMUP_STEPS
            if config.TRAIN.WARMUP else 0,
            warmup_method=config.TRAIN.WARMUP_METHOD,
            last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) /
                           config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1)
    else:
        raise ValueError("Not support lr schedule: {}.".format(
            config.TRAIN.LR_SCHEDULE))

    # broadcast parameter and optimizer state from rank 0 before training start
    if args.dist:
        for v in model.state_dict().values():
            distributed.broadcast(v, src=0)
        # for v in optimizer.state_dict().values():
        #     distributed.broadcast(v, src=0)
        best_epoch = torch.tensor(validation_monitor.best_epoch).cuda()
        best_val = torch.tensor(validation_monitor.best_val).cuda()
        distributed.broadcast(best_epoch, src=0)
        distributed.broadcast(best_val, src=0)
        validation_monitor.best_epoch = best_epoch.item()
        validation_monitor.best_val = best_val.item()

    # apex: amp fp16 mixed-precision training
    if config.TRAIN.FP16:
        # model.apply(bn_fp16_half_eval)
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level='O2',
            keep_batchnorm_fp32=False,
            loss_scale=config.TRAIN.FP16_LOSS_SCALE,
            min_loss_scale=128.0)
        if args.dist:
            model = Apex_DDP(model, delay_allreduce=True)

    train(model,
          optimizer,
          lr_scheduler,
          train_loader,
          train_sampler,
          train_metrics,
          config.TRAIN.BEGIN_EPOCH,
          config.TRAIN.END_EPOCH,
          logger,
          rank=rank,
          batch_end_callbacks=batch_end_callbacks,
          epoch_end_callbacks=epoch_end_callbacks,
          writer=writer,
          validation_monitor=validation_monitor,
          fp16=config.TRAIN.FP16,
          clip_grad_norm=config.TRAIN.CLIP_GRAD_NORM,
          gradient_accumulate_steps=config.TRAIN.GRAD_ACCUMULATE_STEPS)

    return rank, model
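The optimizer setup above groups parameters by name so that entries in config.TRAIN.LR_MULT get the base learning rate scaled by their multiplier, while everything else falls into a default group. A compact sketch of just that grouping logic; the names are illustrative, not the project's API:

import torch.nn as nn

def build_param_groups(model: nn.Module, base_lr: float, lr_mult):
    # lr_mult: iterable of (name_substring, multiplier) pairs.
    groups = [{'params': [p for n, p in model.named_parameters() if key in n],
               'lr': base_lr * mult}
              for key, mult in lr_mult]
    # Remaining parameters use the optimizer's default learning rate.
    groups.append({'params': [p for n, p in model.named_parameters()
                              if all(key not in n for key, _ in lr_mult)]})
    return groups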
Exemplo n.º 29
0
def main(args):

    init_process_group(backend='nccl')

    with open(args.config) as file:
        config = apply_dict(Dict, json.load(file))
    config.update(vars(args))
    config.update(
        dict(world_size=distributed.get_world_size(),
             global_rank=distributed.get_rank(),
             device_count=cuda.device_count(),
             local_rank=distributed.get_rank() % cuda.device_count()))
    print(f'config: {config}')

    backends.cudnn.benchmark = True
    backends.cudnn.fastest = True

    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    cuda.manual_seed(config.seed)
    cuda.set_device(config.local_rank)

    train_dataset = ImageNet(root=config.train_root,
                             meta=config.train_meta,
                             transform=transforms.Compose([
                                 transforms.RandomResizedCrop(224),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ColorJitter(brightness=0.4,
                                                        contrast=0.4,
                                                        saturation=0.4,
                                                        hue=0.2),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=(0.485, 0.456,
                                                            0.406),
                                                      std=(0.229, 0.224,
                                                           0.225))
                             ]))
    val_dataset = ImageNet(root=config.val_root,
                           meta=config.val_meta,
                           transform=transforms.Compose([
                               transforms.Resize(256),
                               transforms.CenterCrop(224),
                               transforms.ToTensor(),
                               transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                                    std=(0.229, 0.224, 0.225)),
                           ]))

    train_sampler = utils.data.distributed.DistributedSampler(train_dataset)
    val_sampler = utils.data.distributed.DistributedSampler(val_dataset)

    train_data_loader = utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config.local_batch_size,
        sampler=train_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    val_data_loader = utils.data.DataLoader(dataset=val_dataset,
                                            batch_size=config.local_batch_size,
                                            sampler=val_sampler,
                                            num_workers=config.num_workers,
                                            pin_memory=True)

    model = SuperMobileNetV2(first_conv_param=Dict(in_channels=3,
                                                   out_channels=32,
                                                   kernel_size=3,
                                                   stride=2),
                             middle_conv_params=[
                                 Dict(in_channels=32,
                                      out_channels=16,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=1,
                                      stride=1),
                                 Dict(in_channels=16,
                                      out_channels=24,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=2,
                                      stride=2),
                                 Dict(in_channels=24,
                                      out_channels=32,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=3,
                                      stride=2),
                                 Dict(in_channels=32,
                                      out_channels=64,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=4,
                                      stride=2),
                                 Dict(in_channels=64,
                                      out_channels=96,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=3,
                                      stride=1),
                                 Dict(in_channels=96,
                                      out_channels=160,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=3,
                                      stride=2),
                                 Dict(in_channels=160,
                                      out_channels=320,
                                      expand_ratio_list=[3, 6],
                                      kernel_size_list=[3, 5],
                                      blocks=1,
                                      stride=1),
                             ],
                             last_conv_param=Dict(in_channels=320,
                                                  out_channels=1280,
                                                  kernel_size=1,
                                                  stride=1),
                             drop_prob=config.drop_prob,
                             num_classes=1000).cuda()

    for tensor in model.state_dict().values():
        distributed.broadcast(tensor, 0)

    criterion = CrossEntropyLoss(config.label_smoothing)

    config.global_batch_size = config.local_batch_size * config.world_size
    config.lr = config.lr * config.global_batch_size / config.global_batch_denom

    optimizer = torch.optim.RMSprop(params=model.weights(),
                                    lr=config.lr,
                                    alpha=config.alpha,
                                    eps=config.eps,
                                    weight_decay=config.weight_decay,
                                    momentum=config.momentum)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=config.milestones,
                                                  gamma=config.gamma)

    last_epoch = -1
    global_step = 0
    if config.checkpoint:
        checkpoint = Dict(torch.load(config.checkpoint))
        model.load_state_dict(checkpoint.model_state_dict)
        optimizer.load_state_dict(checkpoint.optimizer_state_dict)
        last_epoch = checkpoint.last_epoch
        global_step = checkpoint.global_step
    elif config.global_rank == 0:
        if os.path.exists(config.checkpoint_directory):
            shutil.rmtree(config.checkpoint_directory)
        if os.path.exists(config.event_directory):
            shutil.rmtree(config.event_directory)
        os.makedirs(config.checkpoint_directory)
        os.makedirs(config.event_directory)

    if config.global_rank == 0:
        summary_writer = SummaryWriter(config.event_directory)

    if config.training:

        for epoch in range(last_epoch + 1, config.num_epochs):

            train_sampler.set_epoch(epoch)
            lr_scheduler.step(epoch)

            model.train()

            for local_step, (images, targets) in enumerate(train_data_loader):

                step_begin = time.time()

                images = images.cuda(non_blocking=True)
                targets = targets.cuda(non_blocking=True)

                logits = model(images)
                loss = criterion(logits, targets) / config.world_size

                optimizer.zero_grad()

                loss.backward()

                for parameter in model.parameters():
                    distributed.all_reduce(parameter.grad)

                optimizer.step()

                predictions = torch.argmax(logits, dim=1)
                accuracy = torch.mean(
                    (predictions == targets).float()) / config.world_size

                for tensor in [loss, accuracy]:
                    distributed.all_reduce(tensor)

                step_end = time.time()

                if config.global_rank == 0:
                    summary_writer.add_scalars(
                        main_tag='loss',
                        tag_scalar_dict=dict(train=loss),
                        global_step=global_step)
                    summary_writer.add_scalars(
                        main_tag='accuracy',
                        tag_scalar_dict=dict(train=accuracy),
                        global_step=global_step)
                    print(
                        f'[training] epoch: {epoch} global_step: {global_step} local_step: {local_step} '
                        f'loss: {loss:.4f} accuracy: {accuracy:.4f} [{step_end - step_begin:.4f}s]'
                    )

                global_step += 1

            if config.global_rank == 0:
                torch.save(
                    dict(model_state_dict=model.state_dict(),
                         optimizer_state_dict=optimizer.state_dict(),
                         last_epoch=epoch,
                         global_step=global_step),
                    f'{config.checkpoint_directory}/epoch_{epoch}')

            if config.validation:

                model.eval()

                with torch.no_grad():

                    average_loss = 0
                    average_accuracy = 0

                    for local_step, (images,
                                     targets) in enumerate(val_data_loader):

                        images = images.cuda(non_blocking=True)
                        targets = targets.cuda(non_blocking=True)

                        logits = model(images)
                        loss = criterion(logits, targets) / config.world_size

                        predictions = torch.argmax(logits, dim=1)
                        accuracy = torch.mean(
                            (predictions
                             == targets).float()) / config.world_size

                        for tensor in [loss, accuracy]:
                            distributed.all_reduce(tensor)

                        average_loss += loss
                        average_accuracy += accuracy

                    average_loss /= (local_step + 1)
                    average_accuracy /= (local_step + 1)

                if config.global_rank == 0:
                    summary_writer.add_scalars(
                        main_tag='loss',
                        tag_scalar_dict=dict(val=average_loss),
                        global_step=global_step)
                    summary_writer.add_scalars(
                        main_tag='accuracy',
                        tag_scalar_dict=dict(val=average_accuracy),
                        global_step=global_step)
                    print(
                        f'[validation] epoch: {epoch} loss: {average_loss:.4f} accuracy: {average_accuracy:.4f}'
                    )

    elif config.validation:

        model.eval()

        with torch.no_grad():

            average_loss = 0
            average_accuracy = 0

            for local_step, (images, targets) in enumerate(val_data_loader):

                images = images.cuda(non_blocking=True)
                targets = targets.cuda(non_blocking=True)

                logits = model(images)
                loss = criterion(logits, targets) / config.world_size

                predictions = torch.argmax(logits, dim=1)
                accuracy = torch.mean(
                    (predictions == targets).float()) / config.world_size

                for tensor in [loss, accuracy]:
                    distributed.all_reduce(tensor)

                average_loss += loss
                average_accuracy += accuracy

            average_loss /= (local_step + 1)
            average_accuracy /= (local_step + 1)

        if config.global_rank == 0:
            print(
                f'[validation] epoch: {last_epoch} loss: {average_loss:.4f} accuracy: {average_accuracy:.4f}'
            )

    if config.global_rank == 0:
        summary_writer.close()
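
A minimal sketch (not part of the example above) of the metric-averaging pattern used in both validation loops: each rank divides its local loss/accuracy by the world size and then sums the shares across ranks with all_reduce, which yields the global mean on every rank. Names are illustrative and an initialized default process group is assumed.

import torch
import torch.distributed as distributed

def average_across_ranks(value: torch.Tensor, world_size: int) -> torch.Tensor:
    # Each rank contributes value / world_size; summing the shares gives the mean.
    value = value / world_size
    distributed.all_reduce(value)  # default reduce op is SUM
    return value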
Exemplo n.º 30
0
    def broadcast_initialized_params(self, src: int = 0):
        super().broadcast_initialized_params(src)
        distributed.broadcast(self.input_low, src)
        distributed.broadcast(self.input_range, src)
Exemplo n.º 31
0
    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if not self.compression_buffer:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_((1 - momentum),d_p)
                    d_p.copy_(buf)
                all_grads.append(d_p)

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for dev_grads in dev_grads_buckets:
                d_p_new = _flatten_dense_tensors(dev_grads)

                if self.all_reduce:
                    coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max)
                    tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                    dist.all_reduce(tensor_decoded, group = 0)
                    tensor_decoded = tensor_decoded / dist.get_world_size()
                    if self.bidirection_compress:
                        if dist.get_rank() == 0:
                            coded, data_time = QSGD_gpu.encode(tensor_decoded,self.enable_max)
                            tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                        else:
                            tensor_decoded = torch.zeros(tensor_decoded.size()).type_as(tensor_decoded)

                        dist.all_reduce(tensor_decoded, group = 0)
                        
                    d_p_new = tensor_decoded

                else:
                    if self.nodes > 1:
                        if self.compression_buffer:

                            coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max)
                            #specific coded dic just on CPU
                            tensor_signs = coded['signs']
                            tensor_selected = coded['selected']
                            tensor_norm = coded['norm']
                            #size
                            tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs)
                            tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected)
                            # tensor_norm doesn't need a size tensor (its length is fixed)

                            #custom
                            '''
                            print(tensor_signs.type())
                            print(tensor_selected.type())
                            print(tensor_norm.type())
                            '''

                        else:
                            d_p_new = torch.sign(d_p_new)

                        if self.local_rank == 0:

                            if self.all_gather_commu:
                                # This version is only for instances with one GPU each
                                for node_index in self.inter_node_list:
                                    if node_index != self.nodes_rank:

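                                        # Note: a, b, c, d, e, f appear to be timing probes defined
                                        # outside this snippet (set() / record() / get_time()); they
                                        # only instrument the broadcasts below.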
                                        d.set()
                                        coded_temp = coded.copy()

                                        b.set()
                                        tensor_signs_size_temp = tensor_signs_size.clone()
                                        dist.broadcast(tensor_signs_size_temp, node_index, group = self.all_inter_node_group)
                                        b.record()
                                        c.set()
                                        tensor_signs_temp = torch.zeros([int(tensor_signs_size_temp[0])], device = self.device, dtype=torch.int)
                                        print('tensor_signs_temp', tensor_signs_temp.size())
                                        c.record()
                                        a.set()
                                        dist.broadcast(tensor_signs_temp, node_index, group = self.all_inter_node_group)
                                        a.record()
                                        d.record()

                                        e.set()
                                        tensor_selected_size_temp = tensor_selected_size.clone()
                                        dist.broadcast(tensor_selected_size_temp, node_index, group = self.all_inter_node_group)
                                        tensor_selected_temp = torch.zeros([int(tensor_selected_size_temp[0])], device = self.device, dtype=torch.long)                             
                                        dist.broadcast(tensor_selected_temp, node_index, group = self.all_inter_node_group)
                                        print('tensor_selected_temp', tensor_selected_temp.size())
                                        e.record()
                                        f.set()
                                        tensor_norm_temp = tensor_norm.clone()
                                        dist.broadcast(tensor_norm_temp, node_index, group = self.all_inter_node_group)

                                        coded_temp['signs'] = tensor_signs_temp
                                        coded_temp['selected'] = tensor_selected_temp
                                        coded_temp['norm'] = tensor_norm_temp
                                        

                                        tensor_decoded = QSGD_gpu.decode(coded_temp, cuda = True)
                                        d_p_new = d_p_new + tensor_decoded
                                        f.record()

                                        
                                        print('a', a.get_time())
                                        print('b', b.get_time())
                                        print('c', c.get_time())
                                        print('d', d.get_time())
                                        print('e', e.get_time())
                                        print('f', f.get_time())
                                        

                                    else:
                                        dist.broadcast(tensor_signs_size, node_index, group = self.all_inter_node_group)
                                        dist.broadcast(tensor_signs, node_index, group = self.all_inter_node_group) 
                                        dist.broadcast(tensor_selected_size, node_index, group = self.all_inter_node_group)
                                        dist.broadcast(tensor_selected, node_index, group = self.all_inter_node_group) 
                                        dist.broadcast(tensor_norm, node_index, group = self.all_inter_node_group) 
                                d_p_new = d_p_new / dist.get_world_size()

                            else:
                                if dist.get_rank() == 0:
                                    for index, inter_node_group in enumerate(self.inter_node_group_list):
                                        coded_temp = coded.copy()

                                        tensor_signs_size_temp = tensor_signs_size.clone()
                                        dist.broadcast(tensor_signs_size_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                        tensor_signs_temp = torch.randn([int(tensor_signs_size_temp[0])]).type_as(tensor_signs)
                                        dist.broadcast(tensor_signs_temp, self.inter_node_list[index + 1], group = inter_node_group)

                                        tensor_selected_size_temp = tensor_selected_size.clone()
                                        dist.broadcast(tensor_selected_size_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                        tensor_selected_temp = torch.randn([int(tensor_selected_size_temp[0])]).type_as(tensor_selected)                             
                                        dist.broadcast(tensor_selected_temp, self.inter_node_list[index + 1], group = inter_node_group)

                                        tensor_norm_temp = tensor_norm.clone()
                                        dist.broadcast(tensor_norm_temp, self.inter_node_list[index + 1], group = inter_node_group)

                                        coded_temp['signs'] = tensor_signs_temp
                                        coded_temp['selected'] = tensor_selected_temp
                                        coded_temp['norm'] = tensor_norm_temp
                                        

                                        tensor_decoded = QSGD_gpu.decode(coded_temp, cuda = True)
                                        d_p_new = d_p_new + tensor_decoded


                                        '''
                                        #temp
                                        print(tensor_decoded)
                                        tensor_decoded_temp = tensor_decoded.clone()
                                        dist.broadcast(tensor_decoded_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                        if tensor_decoded == tensor_decoded_temp:
                                            print('success')
                                        print(tensor_signs_size_temp)
                                        print(tensor_selected_size_temp)
                                        '''

                                    d_p_new = d_p_new / dist.get_world_size()

                                else:
                                    dist.broadcast(tensor_signs_size, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1])
                                    dist.broadcast(tensor_signs, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) 
                                    dist.broadcast(tensor_selected_size, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1])
                                    dist.broadcast(tensor_selected, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) 
                                    dist.broadcast(tensor_norm, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) 

                                    '''
                                    #temp
                                    tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                                    print(tensor_decoded)
                                    dist.broadcast(tensor_decoded, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) 
                                    print(tensor_signs_size)
                                    print(tensor_selected_size)
                                    '''

                                    dist.barrier(group = self.all_inter_node_group)

                                #os._exit()



                                if self.bidirection_compress:                
                                    if dist.get_rank() == 0:
                                        coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max)
                                        tensor_signs = coded['signs']
                                        tensor_selected = coded['selected']
                                        tensor_norm = coded['norm']

                                        tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs)
                                        tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected)

                                        dist.barrier(group = self.all_inter_node_group)

                                    dist.broadcast(tensor_signs_size, 0, group = self.all_inter_node_group)
                                    dist.broadcast(tensor_selected_size, 0, group = self.all_inter_node_group)
                                    if dist.get_rank() != 0:
                                        torch.cuda.synchronize()
                                        tensor_signs = torch.randn([int(tensor_signs_size[0])]).type_as(tensor_signs)
                                        tensor_selected = torch.randn([int(tensor_selected_size[0])]).type_as(tensor_selected)
                                        torch.cuda.synchronize()

                                    dist.barrier(group = self.all_inter_node_group)

                                    dist.broadcast(tensor_signs, 0, group = self.all_inter_node_group)
                                    dist.broadcast(tensor_selected, 0, group = self.all_inter_node_group)
                                    dist.broadcast(tensor_norm, 0, group = self.all_inter_node_group)

                                    coded['signs'] = tensor_signs
                                    coded['selected'] = tensor_selected
                                    coded['norm'] = tensor_norm

                                    tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                                    d_p_new = tensor_decoded

                                else:
                                    if dist.get_rank() == 0:
                                        dist.barrier(group = self.all_inter_node_group)
                                    dist.broadcast(d_p_new, 0, group = self.all_inter_node_group)

                    else:
                        # test for one
                        coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max)
                        tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                        d_p_new = tensor_decoded


                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new,dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)
            for p in group['params']:
                if self.compression_buffer:
                    if weight_decay != 0:
                        p.grad.data.add_(weight_decay, p.data)
                p.data.add_(-group['lr'], p.grad.data)

        return loss
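
A minimal sketch of the two-step pattern the optimizer above uses for variable-length tensors (signs and selected indices): broadcast the length first, packed into a tensor via pack_len_tensor_into_tensor, so that receiving ranks can allocate a buffer of the right size, then broadcast the payload itself. The helper below is illustrative, not the optimizer's own code, and assumes an initialized default process group.

import torch
import torch.distributed as dist

def broadcast_variable_length(payload: torch.Tensor, src: int, device) -> torch.Tensor:
    rank = dist.get_rank()
    # Step 1: broadcast the length so non-source ranks can allocate a receive buffer.
    length = torch.tensor([payload.numel() if rank == src else 0],
                          dtype=torch.long, device=device)
    dist.broadcast(length, src)
    # Step 2: broadcast the payload into a correctly sized buffer.
    if rank != src:
        # On non-source ranks, payload is only used to pick the dtype.
        payload = torch.zeros(int(length.item()), dtype=payload.dtype, device=device)
    dist.broadcast(payload, src)
    return payload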
Exemplo n.º 32
0
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(DistributedDataParallel, self).__init__()

        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device

        # Sync params and buffers
        for p in self.module.state_dict().values():
            dist.broadcast(p, 0)

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module, self.device_ids)
            self._module_copies[0] = self.module
            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(), module_copy.parameters()):
                    copy_param.detach_()
                    copy_param.requires_grad = param.requires_grad
        else:
            self._module_copies = [self.module]

        # Split parameters into buckets that will coalesce reductions
        # TODO: different types need different buckets
        t = None
        for p in self.module.parameters():
            tp = type(p.data)
            if t is not None and t is not tp:
                raise ValueError("DistributedDataParallel requires all parameters' data to be of the same type")
            t = tp

        self.bucket_sizes = []
        self.bucket_map = {}
        MB = 1024 * 1024
        self.broadcast_bucket_size = 10 * MB  # used for param sync before forward
        bucket_bytes_cap = 1 * MB
        bucket_bytes = bucket_bytes_cap  # to init the first bucket immediately
        for param_tuple in zip(*map(lambda m: m.parameters(), self._module_copies)):
            if bucket_bytes >= bucket_bytes_cap:
                self.bucket_sizes.append(0)
                bucket_bytes = 0
            self.bucket_sizes[-1] += 1
            for p in param_tuple:
                self.bucket_map[p] = len(self.bucket_sizes) - 1
            bucket_bytes += p.numel() * p.element_size()

        self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))]
        self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))]
        self.reduced = [False] * len(self.bucket_sizes)

        self._register_grad_hooks()

        self.dispatch_lock = threading.Lock()
        self._start_reduction_threads()
Exemplo n.º 33
0
    def broadcast_initialized_params(self, src: int = 0):
        super().broadcast_initialized_params(src)
        distributed.broadcast(self.scale, src=src)
        distributed.broadcast(self.signed_tensor, src=src)
Exemplo n.º 34
0
def train300_mlperf_coco(args):
    global torch
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        try:
            from apex.parallel import DistributedDataParallel as DDP
            if 'WORLD_SIZE' in os.environ:
                args.distributed = int(os.environ['WORLD_SIZE']) > 1
        except ImportError:
            raise ImportError("Please install APEX from https://github.com/nvidia/apex")

    local_seed = args.seed
    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl',
                                    init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
    mllogger.event(key=mllog_const.SEED, value=local_seed)
    torch.manual_seed(local_seed)
    np.random.seed(seed=local_seed)

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False,
                                 num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_coco)
    else:
        train_sampler = None
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    # set shuffle=True in DataLoader
    if args.rank==0:
        val_dataloader = DataLoader(val_coco,
                                    batch_size=args.val_batch_size or args.batch_size,
                                    shuffle=False,
                                    sampler=None,
                                    num_workers=4)
    else:
        val_dataloader = None

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(fragment_size))

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v:k for k,v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()


    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu*args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_schedule)
    mllogger.start(
        key=mllog_const.BLOCK_START,
        metadata={mllog_const.FIRST_EPOCH_NUM: 1,
                  mllog_const.EPOCH_COUNT: args.epochs})

    optim.zero_grad()
    for epoch in range(args.epochs):
        mllogger.start(
            key=mllog_const.EPOCH_START,
            metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox, label) in enumerate(train_dataloader):
            current_batch_size = img.shape[0]
            # Split batch for gradient accumulation
            img = torch.split(img, fragment_size)
            bbox = torch.split(bbox, fragment_size)
            label = torch.split(label, fragment_size)

            for (fimg, fbbox, flabel) in zip(img, bbox, label):
                current_fragment_size = fimg.shape[0]
                trans_bbox = fbbox.transpose(1,2).contiguous()
                if use_cuda:
                    fimg = fimg.cuda()
                    trans_bbox = trans_bbox.cuda()
                    flabel = flabel.cuda()
                fimg = Variable(fimg, requires_grad=True)
                ploc, plabel = ssd300(fimg)
                gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                               Variable(flabel, requires_grad=False)
                loss = loss_func(ploc, plabel, gloc, glabel)
                loss = loss * (current_fragment_size / current_batch_size) # weighted mean
                loss.backward()

            warmup_step(iter_num, current_lr)
            optim.step()
            optim.zero_grad()
            if not np.isinf(loss.item()): avg_loss = 0.999*avg_loss + 0.001*loss.item()
            if args.rank == 0 and args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                    .format(iter_num, loss.item(), avg_loss))
            iter_num += 1


        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(recurse=True):
                    if ('running_mean' in bn_name) or ('running_var' in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(key=mllog_const.MODEL_BN_SPAN,
                            value=bn_buf)
            if args.rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save({"model" : ssd300.state_dict(), "label_map": train_coco.label_info},
                               "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                             args.threshold, epoch + 1, iter_num,
                             log_interval=args.log_interval,
                             nms_valid_thresh=args.nms_valid_thresh):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True
            mllogger.end(
                key=mllog_const.EPOCH_STOP,
                metadata={mllog_const.EPOCH_NUM: epoch})
    mllogger.end(
        key=mllog_const.BLOCK_STOP,
        metadata={mllog_const.FIRST_EPOCH_NUM: 1,
                  mllog_const.EPOCH_COUNT: args.epochs})

    return False
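
A minimal sketch of the stop-flag pattern at the end of the training loop above: rank 0 decides whether the target accuracy was reached, and dist.broadcast(success, 0) shares that decision so every rank returns at the same time. Names are illustrative and an initialized default process group is assumed.

import torch
import torch.distributed as dist

def all_ranks_should_stop(stop_now: bool, device) -> bool:
    # Only the value passed on rank 0 matters; the broadcast overwrites it everywhere else.
    flag = torch.tensor([1.0 if stop_now else 0.0], device=device)
    dist.broadcast(flag, 0)
    return bool(flag.item())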
Exemplo n.º 35
0
    def train_network(self,
                      current_progress,
                      overall_progress,
                      time_start,
                      time_end,
                      batch_size=24):

        path = Path(__file__).parents[1] / 'models' / 'train' / 'train_log.log'
        logging.basicConfig(
            format="[%(levelname)s] %(message)s",
            level=logging.INFO,
            filename=path,
            filemode='w+'
        )

        rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
        plots = True  # as default
        adam = False
        # DDP parameter, do not modify
        local_rank = -1
        save_dir = Path(os.path.join(self.path, 'models'))
        weights_dir = os.path.join(save_dir, 'train')
        last = os.path.join(weights_dir, 'last.pt')
        best = os.path.join(weights_dir, 'best.pt')
        results_file = os.path.join(save_dir, 'results.txt')
        self.index_records(weights_dir)
        self.index_classes(self.path)
        total_batch_size = batch_size

        data = check_file(os.path.join(weights_dir, 'data.yaml'))
        cfg = check_file(os.path.join(weights_dir, 'cfg.yaml'))
        hyp = check_file(os.path.join(weights_dir, 'hyp.yaml'))

        cuda = self.device.type != 'cpu'
        init_seeds(2 + rank)
        with open(data) as f:
            data_dict = yaml.load(f, Loader=yaml.SafeLoader)
        with open(hyp) as f:
            hyp_dict = yaml.load(f, Loader=yaml.SafeLoader)
        logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp_dict.items()))
        with torch_distributed_zero_first(rank):
            check_dataset(data_dict)
        train_path = data_dict['train']
        test_path = data_dict['val']
        nc = data_dict['nc']
        epochs = 10000
        names = data_dict['names']
        assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, data)

        weights = 'models/train/last.pt'
        pretrained = weights.endswith('.pt') and self.model is not None
        if pretrained:
            with torch_distributed_zero_first(rank):
                attempt_download(weights)  # download if not found locally
            checkpoint = torch.load(weights, map_location=self.device)  # load checkpoint
            model = Model(cfg or checkpoint['model'].yaml, ch=3, nc=nc, anchors=hyp_dict.get('anchors')).to(self.device)
            exclude = ['anchor'] if cfg or hyp_dict.get('anchors') else []  # exclude keys
            state_dict = checkpoint['model'].float().state_dict()  # to FP32
            state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
            model.load_state_dict(state_dict, strict=False)  # load
            logger.info(
                'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
        else:
            model = Model(cfg, ch=3, nc=nc, anchors=hyp_dict.get('anchors')).to(self.device)  # create

        freeze = []
        for k, v in model.named_parameters():
            v.requires_grad = True
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

        nbs = 64
        accumulate = max(round(nbs / total_batch_size), 1)
        hyp_dict['weight_decay'] *= total_batch_size * accumulate / nbs
        logger.info(f"Scaled weight_decay = {hyp_dict['weight_decay']}")

        pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
        for k, v in model.named_modules():
            if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
                pg2.append(v.bias)  # biases
            if isinstance(v, nn.BatchNorm2d):
                pg0.append(v.weight)  # no decay
            elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
                pg1.append(v.weight)  # apply decay

        # use adam optimizer, false as default
        if adam:
            # adjust beta1 to momentum
            optimizer = optim.Adam(pg0, lr=hyp_dict['lr0'], betas=(hyp_dict['momentum'], 0.999))
        else:
            optimizer = optim.SGD(pg0, lr=hyp_dict['lr0'], momentum=hyp_dict['momentum'], nesterov=True)
        # add pg1 with weight_decay
        optimizer.add_param_group({'params': pg1, 'weight_decay': hyp_dict['weight_decay']})
        optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
        logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
        del pg0, pg1, pg2

        # Scheduler https://arxiv.org/pdf/1812.01187.pdf
        # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
        # false as default
        linear_lr = False
        if linear_lr:
            lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp_dict['lrf']) + hyp_dict['lrf']  # linear
        else:
            lf = one_cycle(1, hyp_dict['lrf'], epochs)
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

        start_epoch, best_fitness = 0, 0.0

        # Image sizes
        gs = int(model.stride.max())  # grid size (max stride)
        nl = model.model[-1].nl  # number of detection layers (used for scaling hyp_dict['obj'])
        # verify imgsz are gs-multiples
        imgsz, imgsz_test = [check_img_size(x, gs) for x in [self.image_size, self.image_size]]

        # DP mode
        if cuda and rank == -1 and torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)

        # SyncBatchNorm, false as default
        sync_bn = False
        if sync_bn and cuda and rank != -1:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(self.device)
            logger.info('Using SyncBatchNorm()')

        # EMA
        ema = ModelEMA(model) if rank in [-1, 0] else None

        # DDP mode
        if cuda and rank != -1:
            model = DDP(model, device_ids=[local_rank], output_device=local_rank)

        # Trainloader
        dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, False,
                                                hyp=hyp_dict, rank=rank,
                                                prefix=colorstr('train: '),
                                                workers=0)
        mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
        nb = len(dataloader)  # number of batches
        overall_progress.value = nb * epochs
        assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, data, nc - 1)

        if rank in [-1, 0]:
            ema.updates = start_epoch * nb // accumulate  # set EMA updates
            testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, False,
                                           hyp=hyp_dict, rect=True, rank=-1,
                                           pad=0.5, prefix=colorstr('val: '))[0]

            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes

            if plots:
                plot_labels(labels, save_dir)

                # Anchors
                check_anchors(dataset, model=model, thr=hyp_dict['anchor_t'], imgsz=imgsz)
            model.half().float()

        # Model parameters
        hyp_dict['box'] *= 3. / nl  # scale to layers
        hyp_dict['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers
        hyp_dict['obj'] *= (imgsz / 640) ** 2 * 3. / nl  # scale to image size and layers
        model.nc = nc  # attach number of classes to model
        model.hyp = hyp_dict  # attach hyperparameters to model
        model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
        model.class_weights = labels_to_class_weights(dataset.labels, nc).to(self.device) * nc  # attach class weights
        model.names = names

        # Start training
        time_start.value = time.time()
        # number of warmup iterations, max(3 epochs, 1k iterations)
        nw = max(round(hyp_dict['warmup_epochs'] * nb), 1000)
        maps = np.zeros(nc)  # mAP per class
        results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
        scheduler.last_epoch = start_epoch - 1  # do not move
        scaler = amp.GradScaler(enabled=cuda)
        compute_loss = ComputeLoss(model)  # init loss class
        logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n'
                    f'Using {dataloader.num_workers} dataloader workers\n'
                    f'Logging results to {save_dir}\n'
                    f'Starting training for {epochs} epochs...')

        for epoch in range(start_epoch, epochs):  # epoch -----------------------
            model.train()

            # Update image weights (optional)
            image_weights = False
            if image_weights:
                # Generate indices
                if rank in [-1, 0]:
                    cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                    iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                    dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
                # Broadcast if DDP
                if rank != -1:
                    indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                    dist.broadcast(indices, 0)
                    if rank != 0:
                        dataset.indices = indices.cpu().numpy()

            mloss = torch.zeros(4, device=self.device)  # mean losses
            if rank != -1:
                dataloader.sampler.set_epoch(epoch)
            pbar = enumerate(dataloader)
            logger.info('%10s' * 8 % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
            if rank in [-1, 0]:
                pbar = tqdm(pbar, total=nb)  # progress bar
            optimizer.zero_grad()
            for i, (imgs, targets, paths, _) in pbar:  # batch ---------------------------
                ni = i + nb * epoch  # number integrated batches (since train start)
                current_progress.value = ni

                imgs = imgs.to(self.device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

                # Warmup
                if ni <= nw:
                    xi = [0, nw]  # x interp
                    # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                    accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                    for j, x in enumerate(optimizer.param_groups):
                        # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                        x['lr'] = np.interp(ni, xi,
                                            [hyp_dict['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                        if 'momentum' in x:
                            x['momentum'] = np.interp(ni, xi, [hyp_dict['warmup_momentum'], hyp_dict['momentum']])

                # Multi-scale
                multi_scale = False
                if multi_scale:
                    sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                    sf = sz / max(imgs.shape[2:])  # scale factor
                    if sf != 1:
                        ns = [math.ceil(x * sf / gs) * gs for x in
                              imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                        imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

                # Forward
                with amp.autocast(enabled=cuda):
                    pred = model(imgs)  # forward
                    loss, loss_items = compute_loss(pred, targets.to(self.device))  # loss scaled by batch_size
                    if rank != -1:
                        # gradient averaged between devices in DDP mode
                        loss *= int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1

                # Backward
                scaler.scale(loss).backward()

                # Optimize
                if ni % accumulate == 0:
                    scaler.step(optimizer)  # optimizer.step
                    scaler.update()
                    optimizer.zero_grad()
                    if ema:
                        ema.update(model)

                # Print
                if rank in [-1, 0]:
                    mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                    mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                    s = ('%10s' * 2 + '%10.4g' * 6) % (
                        '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                    pbar.set_description(s)
                    logger.info(s)

                    # Plot
                    if plots and ni < 3:
                        f = save_dir / f'train_batch{ni}.jpg'  # filename
                        Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()

                # end batch -----------------------------------------------------
            # end epoch ---------------------------------------------------------

            # Scheduler
            lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
            scheduler.step()

            # DDP process 0 or single-GPU
            if rank in [-1, 0]:
                # mAP
                if ema:
                    ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
                final_epoch = epoch + 1 == epochs
                if final_epoch or True:  # Calculate mAP
                    results, maps, times = test.test(data,
                                                     batch_size=batch_size * 2,
                                                     imgsz=imgsz_test,
                                                     model=ema.ema,
                                                     single_cls=False,
                                                     dataloader=testloader,
                                                     save_dir=save_dir,
                                                     verbose=nc < 50 and final_epoch,
                                                     plots=plots and final_epoch,
                                                     log_imgs=0,
                                                     compute_loss=compute_loss)

                # Write
                with open(results_file, 'w+') as f:
                    f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
                bucket = ''
                if bucket:
                    os.system('gsutil cp %s gs://results/results.txt' % results_file)

                # Update best mAP
                fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
                if fi > best_fitness:
                    best_fitness = fi

                # Save model
                save = True
                if save:
                    with open(results_file, 'r') as f:  # create checkpoint
                        checkpoint = {
                            'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict(),
                            'wandb_id': None
                        }

                    # Save last, best and delete
                    if current_progress.value % 100:
                        torch.save(checkpoint, last)
                    if best_fitness == fi:
                        torch.save(checkpoint, best)
                    del checkpoint
            # end epoch ---------------------------------------------------------
        # end training
        time_end.value = time.time() - time_start.value
        self.load_network()
Exemplo n.º 36
0
def broadcast_module_itr(args, module: torch.nn.Module, source=0):
    group = dist.new_group(list(range(args.num_subnet)))
    for para in module.parameters():
        dist.broadcast(para.data, src=source, group=group, async_op=False)
    dist.destroy_process_group(group)
Exemplo n.º 37
0
    def sync_parameters(self):
        for param in self.module.parameters():
            dist.broadcast(param.data, 0)
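
The helper above synchronizes only parameters; buffers such as BatchNorm running statistics are often synchronized the same way, as the DistributedDataParallel examples in this list do by broadcasting everything in state_dict(). A minimal sketch of that variant (illustrative name, default process group assumed):

import torch.distributed as dist
import torch.nn as nn

def sync_parameters_and_buffers(module: nn.Module, src: int = 0):
    # state_dict() covers parameters and buffers (e.g. running_mean / running_var).
    for tensor in module.state_dict().values():
        dist.broadcast(tensor, src)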
Exemplo n.º 38
0
def cluster_memory(model,
                   local_memory_index,
                   local_memory_embeddings,
                   size_dataset,
                   nmb_kmeans_iters=10):
    j = 0
    assignments = -100 * torch.ones(len(args.nmb_prototypes),
                                    size_dataset).long()
    with torch.no_grad():
        for i_K, K in enumerate(args.nmb_prototypes):
            # run distributed k-means

            # init centroids with elements from memory bank of rank 0
            centroids = torch.empty(K, args.feat_dim).cuda(non_blocking=True)
            if args.rank == 0:
                random_idx = torch.randperm(len(
                    local_memory_embeddings[j]))[:K]
                assert len(
                    random_idx) >= K, "please reduce the number of centroids"
                centroids = local_memory_embeddings[j][random_idx]
            dist.broadcast(centroids, 0)

            for n_iter in range(nmb_kmeans_iters + 1):

                # E step
                dot_products = torch.mm(local_memory_embeddings[j],
                                        centroids.t())
                _, local_assignments = dot_products.max(dim=1)

                # finish
                if n_iter == nmb_kmeans_iters:
                    break

                # M step
                where_helper = get_indices_sparse(
                    local_assignments.cpu().numpy())
                counts = torch.zeros(K).cuda(non_blocking=True).int()
                emb_sums = torch.zeros(K,
                                       args.feat_dim).cuda(non_blocking=True)
                for k in range(len(where_helper)):
                    if len(where_helper[k][0]) > 0:
                        emb_sums[k] = torch.sum(
                            local_memory_embeddings[j][where_helper[k][0]],
                            dim=0,
                        )
                        counts[k] = len(where_helper[k][0])
                dist.all_reduce(counts)
                mask = counts > 0
                dist.all_reduce(emb_sums)
                centroids[mask] = emb_sums[mask] / counts[mask].unsqueeze(1)

                # normalize centroids
                centroids = nn.functional.normalize(centroids, dim=1, p=2)

            getattr(model.module.prototypes,
                    "prototypes" + str(i_K)).weight.copy_(centroids)

            # gather the assignments
            assignments_all = torch.empty(args.world_size,
                                          local_assignments.size(0),
                                          dtype=local_assignments.dtype,
                                          device=local_assignments.device)
            assignments_all = list(assignments_all.unbind(0))
            dist_process = dist.all_gather(assignments_all,
                                           local_assignments,
                                           async_op=True)
            dist_process.wait()
            assignments_all = torch.cat(assignments_all).cpu()

            # gather the indexes
            indexes_all = torch.empty(args.world_size,
                                      local_memory_index.size(0),
                                      dtype=local_memory_index.dtype,
                                      device=local_memory_index.device)
            indexes_all = list(indexes_all.unbind(0))
            dist_process = dist.all_gather(indexes_all,
                                           local_memory_index,
                                           async_op=True)
            dist_process.wait()
            indexes_all = torch.cat(indexes_all).cpu()

            # log assignments
            assignments[i_K][indexes_all] = assignments_all

            # next memory bank to use
            j = (j + 1) % len(args.crops_for_assign)

    return assignments
Exemplo n.º 39
0
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(DistributedDataParallel, self).__init__()

        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device

        # Sync params and buffers
        for p in self.module.state_dict().values():
            dist.broadcast(p, 0)

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module, self.device_ids)
            self._module_copies[0] = self.module
            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(), module_copy.parameters()):
                    copy_param.detach_()
                    copy_param.requires_grad = param.requires_grad
        else:
            self._module_copies = [self.module]

        # Split parameters into buckets that will coalesce reductions
        # TODO: different types need different buckets
        t = None
        for p in self.module.parameters():
            tp = type(p.data)
            if t is not None and t is not tp:
                raise ValueError("DistributedDataParallel requires all parameters' data to be of the same type")
            t = tp

        self.bucket_sizes = []
        self.bucket_map = {}
        MB = 1024 * 1024
        self.broadcast_bucket_size = 10 * MB  # used for param sync before forward
        bucket_bytes_cap = 1 * MB
        bucket_bytes = bucket_bytes_cap  # to init the first bucket immediately
        for param_tuple in zip(*map(lambda m: m.parameters(), self._module_copies)):
            if bucket_bytes >= bucket_bytes_cap:
                self.bucket_sizes.append(0)
                bucket_bytes = 0
            self.bucket_sizes[-1] += 1
            for p in param_tuple:
                self.bucket_map[p] = len(self.bucket_sizes) - 1
            bucket_bytes += p.numel() * p.element_size()

        self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))]
        self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))]
        self.reduced = [False] * len(self.bucket_sizes)

        self._register_grad_hooks()

        self.dispatch_lock = threading.Lock()
        self._start_reduction_threads()
Exemplo n.º 40
0
def run(args):
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    torch.manual_seed(1234)

    logging.info(f"{args.rank}-th worker starts.")

    read_start = time.time()

    f_id_start = args.rank * args.num_files
    f_id_end = f_id_start + args.num_files
    f_path_list = [
        "{}/{}".format(args.root, i) for i in range(f_id_start, f_id_end)
    ]
    f = open(f_path_list[0]).readlines()
    dataset = DenseLibsvmDataset(f, args.features, args.pos_tag)
    if len(f_path_list) > 1:
        for file_name in f_path_list[1:]:
            f = open(file_name).readlines()
            dataset.add_more(f)

    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))

    train_set = np.array(dataset.ins_list)

    dt = train_set.dtype
    centroid_shape = (args.num_clusters, train_set.shape[1])
    logging.info(f"Loading dataset costs {time.time() - read_start}s")
    logging.info(f"centorid shape: {centroid_shape}")

    # initialize centroids
    init_cent_start = time.time()
    if args.rank == 0:
        centroids = torch.tensor(train_set[0:args.num_clusters])
    else:
        centroids = torch.empty(args.num_clusters, args.features)

    if dist_is_initialized():
        dist.broadcast(centroids, 0)
    logging.info(
        f"Receiving initial centroids costs {time.time() - init_cent_start}s")

    training_start = time.time()
    avg_error = np.iinfo(np.int16).max
    for epoch in range(args.epochs):
        if avg_error >= args.threshold:
            start_compute = time.time()
            model = Kmeans(train_set,
                           centroids,
                           avg_error,
                           centroid_type='tensor')
            model.find_nearest_cluster()
            end_compute = time.time()
            #logging.info(f"{args.rank}-th worker computing centroids takes {end_compute - start_compute}s")
            sync_start = time.time()
            if dist_is_initialized():
                centroids, avg_error = broadcast_average(
                    args, model.get_centroids("dense_tensor"),
                    torch.tensor(model.error))
            logging.info(f"{args.rank}-th worker finished {epoch} epoch. "
                         f"Computing takes {end_compute - start_compute}s."
                         f"Communicating takes {time.time() - sync_start}s. "
                         #f"Centroids: {model.get_centroids('dense_tensor')}. "
                         f"Loss: {model.error}")
        else:
            logging.info(
                f"{args.rank}-th worker finished training. Error = {avg_error}, centroids = {centroids}"
            )
            logging.info(
                f"Whole process time : {time.time() - training_start}")
            return
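
broadcast_average is not defined in this snippet; the sketch below is only one plausible implementation (an assumption, not the author's code), averaging the per-worker centroids and error with all_reduce so every worker continues from the same state.

import torch
import torch.distributed as dist

def broadcast_average(args, centroids: torch.Tensor, error: torch.Tensor):
    # Sum the per-worker values, then divide by the number of workers to get the mean.
    world_size = dist.get_world_size()
    dist.all_reduce(centroids)
    dist.all_reduce(error)
    return centroids / world_size, (error / world_size).item()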