Example #1
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.is_distributed:
        print(
            "INFO:PyTorch: Initialize process group for distributed training")
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        distributed.init_process_group(backend=args.dist_backend,
                                       init_method=args.dist_url,
                                       world_size=args.world_size,
                                       rank=args.rank)

    if args.gpu is not None:
        if not args.evaluate:
            print(
                "INFO:PyTorch: Use GPU: {} for training, the rank of this GPU is {}"
                .format(args.gpu, args.rank))
        else:
            print(
                "INFO:PyTorch: Use GPU: {} for evaluating, the rank of this GPU is {}"
                .format(args.gpu, args.rank))

    # set the name of the process
    setproctitle.setproctitle(args.proc_name + '_rank{}'.format(args.rank))
    if not args.multiprocessing_distributed or \
     (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
        # define tensorboard summary
        val_writer = SummaryWriter(log_dir=os.path.join(args.model_dir, 'val'))

    # define loss function (criterion) and optimizer
    if args.is_label_smoothing:
        criterion = label_smoothing.label_smoothing_CE(reduction='mean')
    else:
        criterion = nn.CrossEntropyLoss()

    # create model
    if args.pretrained:
        model_info = "INFO:PyTorch: using pre-trained model '{}'".format(
            args.arch)
    else:
        model_info = "INFO:PyTorch: creating model '{}'".format(args.arch)

    print(model_info)
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=criterion)

    # print the number of parameters in the model
    print("INFO:PyTorch: The number of parameters in the model is {}".format(
        metric.get_the_number_of_params(model)))
    if args.is_summary:
        summary_choice = 0
        if summary_choice == 0:
            summary.summary(model,
                            torch.rand((1, 3, args.crop_size, args.crop_size)),
                            target=torch.ones(1, dtype=torch.long))
        else:
            flops, params = profile(model,
                                    inputs=(torch.rand((1, 3, args.crop_size,
                                                        args.crop_size)),
                                            torch.ones(1, dtype=torch.long),
                                            'summary'))
            print(clever_format([flops, params], "%.4f"))
        return None

    if args.is_distributed:
        if args.world_size > 1 and args.is_syncbn:
            print(
                "INFO:PyTorch: convert torch.nn.BatchNormND layer in the model to torch.nn.SyncBatchNorm layer"
            )
            # only single gpu per process is currently supported
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)

    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)

    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # optimizer
    if args.is_wd_all:
        print(
            "INFO:PyTorch: Applying weight decay to all learnable parameters in the model."
        )
        param_groups = model.parameters()
    else:
        param_groups = lr_scheduler.get_parameter_groups(model)

    if args.optimizer == 'SGD':
        print("INFO:PyTorch: using SGD optimizer.")
        optimizer = torch.optim.SGD(
            param_groups,
            args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            nesterov=args.is_nesterov)
    elif args.optimizer == "AdamW":
        print("INFO:PyTorch: using AdamW optimizer.")
        optimizer = torch.optim.AdamW(param_groups,
                                      lr=args.lr,
                                      betas=(0.9, 0.999),
                                      eps=1e-4,
                                      weight_decay=args.weight_decay)

    elif args.optimizer == "RMSprop":
        # See efficientNet at https://github.com/tensorflow/tpu/
        print("INFO:PyTorch: using RMSprop optimizer.")
        optimizer = torch.optim.RMSprop(param_groups,
                                        lr=args.lr,
                                        alpha=0.9,
                                        weight_decay=args.weight_decay,
                                        momentum=0.9)

    elif args.optimizer == "RMSpropTF":
        # https://github.com/rwightman/pytorch-image-models/blob/fcb6258877/timm/optim/rmsprop_tf.py
        print("INFO:PyTorch: using RMSpropTF optimizer.")
        optimizer = rmsprop_tf.RMSpropTF(param_groups,
                                         lr=args.lr,
                                         alpha=0.9,
                                         eps=0.001,
                                         weight_decay=args.weight_decay,
                                         momentum=0.9,
                                         decoupled_decay=False)
    else:
        raise NotImplementedError

    # PyTorch AMP loss scaler
    scaler = None if not args.is_amp else amp.GradScaler()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(
                args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)

            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            """
			if args.gpu is not None:
				# best_acc1 may be from a checkpoint from a different GPU
				best_acc1 = best_acc1.to(args.gpu)
			"""
            model.load_state_dict(checkpoint['state_dict'])
            print("INFO:PyTorch: Loading state_dict of optimizer")
            optimizer.load_state_dict(checkpoint['optimizer'])

            if "scaler" in checkpoint:
                print("INFO:PyTorch: Loading state_dict of AMP loss scaler")
                scaler.load_state_dict(checkpoint['scaler'])

            print("INFO:PyTorch: => loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(
                args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    # Data loading code
    data_split_factor = args.loop_factor if args.is_diff_data_train else 1
    print("INFO:PyTorch: => The number of views of train data is '{}'".format(
        data_split_factor))
    train_loader, train_sampler = factory.get_data_loader(
        args.data,
        split_factor=data_split_factor,
        batch_size=args.batch_size,
        crop_size=args.crop_size,
        dataset=args.dataset,
        split="train",
        is_distributed=args.is_distributed,
        is_autoaugment=args.is_autoaugment,
        randaa=args.randaa,
        is_cutout=args.is_cutout,
        erase_p=args.erase_p,
        num_workers=args.workers)
    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)
    # learning rate scheduler
    scheduler = lr_scheduler.lr_scheduler(
        mode=args.lr_mode,
        init_lr=args.lr,
        num_epochs=args.epochs,
        iters_per_epoch=len(train_loader),
        lr_milestones=args.lr_milestones,
        lr_step_multiplier=args.lr_step_multiplier,
        slow_start_epochs=args.slow_start_epochs,
        slow_start_lr=args.slow_start_lr,
        end_lr=args.end_lr,
        multiplier=args.lr_multiplier,
        decay_factor=args.decay_factor,
        decay_epochs=args.decay_epochs,
        staircase=True)

    if args.evaluate:
        validate(val_loader, model, args)
        return None

    saved_ckpt_filenames = []

    streams = None
    # streams = [torch.cuda.Stream() for i in range(args.loop_factor)]

    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.is_distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader,
              model,
              optimizer,
              scheduler,
              epoch,
              args,
              streams,
              scaler=scaler)

        if (epoch + 1) % args.eval_per_epoch == 0:
            # evaluate on validation set
            acc_all = validate(val_loader, model, args)

            # remember best acc@1 and save checkpoint
            is_best = acc_all[0] > best_acc1
            best_acc1 = max(acc_all[0], best_acc1)

            # save checkpoint
            if not args.multiprocessing_distributed or \
             (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
                # summary per epoch
                val_writer.add_scalar('avg_acc1',
                                      acc_all[0],
                                      global_step=epoch)
                if args.dataset == 'imagenet':
                    val_writer.add_scalar('avg_acc5',
                                          acc_all[1],
                                          global_step=epoch)

                for i in range(2, args.loop_factor + 2):
                    val_writer.add_scalar('{}_acc1'.format(i - 1),
                                          acc_all[i],
                                          global_step=epoch)

                val_writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'],
                                      global_step=epoch)
                val_writer.add_scalar('best_acc1',
                                      best_acc1,
                                      global_step=epoch)

                # save checkpoints
                filename = "checkpoint_{0}.pth.tar".format(epoch)
                saved_ckpt_filenames.append(filename)
                # remove the oldest file if the number of saved ckpts is greater than args.max_ckpt_nums
                if len(saved_ckpt_filenames) > args.max_ckpt_nums:
                    os.remove(
                        os.path.join(args.model_dir,
                                     saved_ckpt_filenames.pop(0)))

                ckpt_dict = {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }

                if args.is_amp:
                    ckpt_dict['scaler'] = scaler.state_dict()

                metric.save_checkpoint(ckpt_dict,
                                       is_best,
                                       args.model_dir,
                                       filename=filename)

    # clean GPU cache
    torch.cuda.empty_cache()
    sys.exit(0)
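
A minimal launcher sketch (not part of the original listing): given its (gpu, ngpus_per_node, args) signature and the args.multiprocessing_distributed flag, main_worker is typically spawned once per GPU with torch.multiprocessing.spawn, or called directly in the single-process case. The main() name and the exact flag handling below are assumptions; main_worker is the function from Example #1.

import torch
import torch.multiprocessing as mp


def main(args):
    # hypothetical entry point; args is assumed to carry the same flags used above
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # world_size becomes the total number of processes across all nodes
        args.world_size = ngpus_per_node * args.world_size
        # spawn one process per GPU; each process receives its local gpu index
        # as the first argument of main_worker
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        main_worker(args.gpu, ngpus_per_node, args)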
Example #2
def multistreams_test(args):
    """
	This is a simple program for validating the idea of parallel runing of multiple
	model on single gpu via multi cuda streams.
	"""
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(
                args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {
                k.replace("module.models", "models"): v
                for k, v in old_dict.items()
            }

            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(
                args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)
    # record the top-1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader),
                                    *top1_all,
                                    avg_top1,
                                    avg_top5,
                                    prefix='Test: ')

    # switch to evaluate mode
    model.eval()
    # move model to the gpu
    cuda_models = []
    cuda_streams = []
    for idx in range(args.split_factor):
        cuda_streams.append(torch.cuda.Stream())
        cuda_models.append(model.models[idx].cuda(0))
    torch.cuda.synchronize()

    # record time and number of samples
    n_count = 0.0
    start_time = time.time()

    with torch.no_grad():
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(0, non_blocking=True)
            target = target.cuda(0, non_blocking=True)
            collect_outputs = []

            if args.is_amp:
                # NOTE: this AMP branch assumes split_factor == 2
                with torch.cuda.stream(cuda_streams[0]):
                    with amp.autocast():
                        output_0 = cuda_models[0](images)

                with torch.cuda.stream(cuda_streams[1]):
                    with amp.autocast():
                        output_1 = cuda_models[1](images)
                collect_outputs.extend([output_0, output_1])

            else:
                for idx in range(args.split_factor):
                    with torch.cuda.stream(cuda_streams[idx]):
                        collect_outputs.append(cuda_models[idx](images))
            torch.cuda.synchronize()

            # outputs are fp16 under AMP
            outputs = torch.stack(collect_outputs, dim=0)
            ensemble_output = torch.mean(outputs, dim=0)

            # measure accuracy and record loss
            batch_size_now = images.size(0)
            n_count += batch_size_now
            for j in range(args.loop_factor):
                acc1, acc5 = metric.accuracy(outputs[j, ...],
                                             target,
                                             topk=(1, 5))
                top1_all[j].update(acc1[0].item(), batch_size_now)

            # simply average outputs of small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output,
                                                 target,
                                                 topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)

            #if i >= 200:
            #	break

            if i % args.print_freq == 0:
                progress.print(i)

        time_cnt = time.time() - start_time

        # print accuracy info
        acc_all = []
        acc_all.append(avg_top1.avg)
        acc_all.append(avg_top5.avg)
        acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
        mean_acc = 0.0
        for j in range(args.loop_factor):
            acc_all.append(top1_all[j].avg)
            acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
            mean_acc += top1_all[j].avg
        acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
        print(acc_info)

    print("The tested architecture is {} with split_factor {}".format(
        args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} second".format(time_cnt))
    print("The average test time is {}ms per images".format(1000 * time_cnt /
                                                            n_count))

    torch.cuda.empty_cache()
    sys.exit(0)
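
A self-contained sketch of the multi-stream pattern the function above relies on, with toy nn.Linear models standing in for the SplitNet members (all names here are illustrative): each forward pass is enqueued on its own torch.cuda.Stream so kernels can overlap on one GPU, and the default stream synchronizes before the outputs are combined.

import torch
import torch.nn as nn

models = [nn.Linear(8, 4).cuda().eval() for _ in range(2)]
streams = [torch.cuda.Stream() for _ in range(2)]
x = torch.randn(16, 8, device='cuda')

with torch.no_grad():
    outputs = []
    for model, stream in zip(models, streams):
        # make the side stream wait for work already queued on the default stream
        stream.wait_stream(torch.cuda.current_stream())
        # the forward pass runs on its own stream, so kernels may overlap
        with torch.cuda.stream(stream):
            outputs.append(model(x))
    # wait for all streams before using the results on the default stream
    torch.cuda.synchronize()
    ensemble = torch.mean(torch.stack(outputs, dim=0), dim=0)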
Example #3
def multigpu_test_2gpus(args):
    """
	This is a simple program for validating the idea of parallel runing of multiple
	model on multiple gpus.
	"""
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(
                args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {
                k.replace("module.models", "models"): v
                for k, v in old_dict.items()
            }
            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(
                args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)
    # record the top-1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader),
                                    *top1_all,
                                    avg_top1,
                                    avg_top5,
                                    prefix='Test: ')

    # switch to evaluate mode
    model.eval()
    # move model to the gpu
    if args.is_test_on_multigpus:
        print("INFO:PyTorch: multi GPUs test")
        cuda_models = []
        for idx in range(args.split_factor):
            cuda_models.append(model.models[idx].cuda(idx))
    else:
        print("INFO:PyTorch: single GPU test")
        model = model.cuda(0)

    with torch.no_grad():
        # record time and number of samples
        prefetcher = data_prefetcher_2gpus(val_loader, ngpus=args.split_factor)
        images_gpu0, target, images_gpu1 = prefetcher.next()
        i = 0
        n_count = 0.0
        start_time = time.time()

        while images_gpu0 is not None:
            i += 1
            # for i, (images, target) in enumerate(val_loader):
            # compute outputs and losses
            if args.is_test_on_multigpus:
                if args.is_amp:
                    with amp.autocast():
                        output_gpu0 = cuda_models[0](images_gpu0)
                    with amp.autocast():
                        output_gpu1 = cuda_models[1](images_gpu1)
                else:
                    output_gpu0 = cuda_models[0](images_gpu0)
                    output_gpu1 = cuda_models[1](images_gpu1)

                if _GEO_TEST:
                    if i == 1:
                        print("using geometry mean")
                    output_gpu0 = F.softmax(output_gpu0, dim=-1)
                    output_gpu1 = F.softmax(output_gpu1, dim=-1)
                    ensemble_output = torch.sqrt(output_gpu0 *
                                                 output_gpu1.cuda(0))
                else:
                    outputs = torch.stack([output_gpu0, output_gpu1.cuda(0)])
                    ensemble_output = torch.mean(outputs, dim=0)

            else:
                # compute outputs and losses
                if args.is_amp:
                    with amp.autocast():
                        ensemble_output, outputs, ce_loss = model(
                            images_gpu0, target=target, mode='val')
                else:
                    ensemble_output, outputs, ce_loss = model(images_gpu0,
                                                              target=target,
                                                              mode='val')

            # measure accuracy and record loss
            """
			target = target.cpu()
			ensemble_output = ensemble_output.cpu().float()
			outputs = outputs.cpu().float()
			"""

            batch_size_now = images_gpu0.size(0)
            """
			for j in range(args.loop_factor):
				acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
				top1_all[j].update(acc1[0].item(), batch_size_now)
			"""
            # simply average outputs of small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output,
                                                 target,
                                                 topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)

            images_gpu0, target, images_gpu1 = prefetcher.next()

            n_count += batch_size_now
            """
			if i % args.print_freq == 0:
				progress.print(i)
			"""
        time_cnt = time.time() - start_time
        # print accuracy info
        acc_all = []
        acc_all.append(avg_top1.avg)
        acc_all.append(avg_top5.avg)
        acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
        """
		mean_acc = 0.0
		for j in range(args.loop_factor):
			acc_all.append(top1_all[j].avg)
			acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
			mean_acc += top1_all[j].avg
		acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
		"""
        print(acc_info)

    print("multiple GPUs ({})".format(args.is_test_on_multigpus))
    print("The tested architecture is {} with split_factor {}".format(
        args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} second".format(time_cnt))
    print("The average test time is {}ms per images".format(1000 * time_cnt /
                                                            n_count))

    torch.cuda.empty_cache()
    sys.exit(0)
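
The data_prefetcher_2gpus helper used above is not shown in this listing; purely as an illustration of the interface it is called with (.next() returning (images_gpu0, target, images_gpu1)), a guessed minimal implementation could look like the sketch below. The real helper in the repository may differ.

import torch


class data_prefetcher_2gpus:
    """Hypothetical reimplementation of the prefetcher interface used above."""

    def __init__(self, loader, ngpus=2):
        self.loader = iter(loader)
        self.ngpus = ngpus
        self.stream = torch.cuda.Stream()
        self.preload()

    def preload(self):
        try:
            images, target = next(self.loader)
        except StopIteration:
            self.next_images_gpu0 = None
            self.next_target = None
            self.next_images_gpu1 = None
            return
        # issue the host-to-device copies on a side stream so they can overlap
        # with compute; the loader should use pin_memory=True for truly async copies
        with torch.cuda.stream(self.stream):
            self.next_images_gpu0 = images.cuda(0, non_blocking=True)
            self.next_target = target.cuda(0, non_blocking=True)
            self.next_images_gpu1 = (images.cuda(1, non_blocking=True)
                                     if self.ngpus > 1 else None)

    def next(self):
        # make the default stream wait until the queued copies have finished
        torch.cuda.current_stream().wait_stream(self.stream)
        images_gpu0 = self.next_images_gpu0
        target = self.next_target
        images_gpu1 = self.next_images_gpu1
        self.preload()
        return images_gpu0, target, images_gpu1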
Example #4
def multigpu_test(args):
    """
	This is a simple program for validating the idea of parallel runing of multiple
	model on multiple gpus.
	"""
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(
                args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {
                k.replace("module.models", "models"): v
                for k, v in old_dict.items()
            }
            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(
                args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)
    # record the top-1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader),
                                    *top1_all,
                                    avg_top1,
                                    avg_top5,
                                    prefix='Test: ')

    # switch to evaluate mode
    model.eval()
    n_count = 0.0

    # move model to the gpu
    cuda_models = []
    for idx in range(args.split_factor):
        cuda_models.append(model.models[idx].cuda(idx))
    start_time = time.time()

    for i, (images, target) in enumerate(val_loader):
        cuda_images = []
        cuda_outputs = []
        collect_outputs = []
        target = target.cuda(0, non_blocking=True)
        for idx in range(args.split_factor):
            cuda_images.append(images.cuda(idx, non_blocking=True))

        if args.is_amp:
            with amp.autocast():
                for idx in range(args.split_factor):
                    cuda_outputs.append(cuda_models[idx](cuda_images[idx]))
        else:
            for idx in range(args.split_factor):
                cuda_outputs.append(cuda_models[idx](cuda_images[idx]))

        for idx in range(args.split_factor):
            # use the first gpu as the host gpu
            collect_outputs.append(cuda_outputs[idx].cuda(0))

        if _GEO_TEST:
            if i == 1:
                print("using geometric mean")
            cmul = 1.0
            for j in range(args.split_factor):
                cmul = cmul * F.softmax(cuda_outputs[j].cuda(0), dim=-1)
            # ensemble_output = torch.pow(cmul, 1.0 / args.split_factor)
            # note: sqrt equals the geometric mean only when split_factor == 2
            ensemble_output = torch.sqrt(cmul)
        else:
            outputs = torch.stack(collect_outputs, dim=0)
            ensemble_output = torch.mean(outputs, dim=0)

        batch_size_now = images.size(0)
        """
		for j in range(args.loop_factor):
			acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
			top1_all[j].update(acc1[0].item(), batch_size_now)
		"""
        # simply average outputs of small networks
        avg_acc1, avg_acc5 = metric.accuracy(ensemble_output,
                                             target,
                                             topk=(1, 5))
        avg_top1.update(avg_acc1[0].item(), batch_size_now)
        avg_top5.update(avg_acc5[0].item(), batch_size_now)

        n_count += batch_size_now
        """
		if i % args.print_freq == 0:
			progress.print(i)
		"""
    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
    """
	mean_acc = 0.0
	for j in range(args.loop_factor):
		acc_all.append(top1_all[j].avg)
		acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
		mean_acc += top1_all[j].avg
	acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
	"""
    print(acc_info)

    print("multiple GPUs ({})".format(args.is_test_on_multigpus))
    print("The tested architecture is {} with split_factor {}".format(
        args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} second".format(time_cnt))
    print("The average test time is {}ms per images".format(1000 * time_cnt /
                                                            n_count))

    torch.cuda.empty_cache()
    sys.exit(0)
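
For reference, the _GEO_TEST branch above replaces the arithmetic mean of the raw outputs with a geometric mean of the softmax probabilities. A small stand-alone sketch of both reductions for an arbitrary split_factor (toy tensors, illustrative names only):

import torch
import torch.nn.functional as F

split_factor = 4
# one logit tensor per small network (batch of 8, 10 classes), purely illustrative
logits = [torch.randn(8, 10) for _ in range(split_factor)]

# arithmetic mean of the raw outputs (the default branch above)
arith = torch.mean(torch.stack(logits, dim=0), dim=0)

# geometric mean of the softmax probabilities (the _GEO_TEST branch above)
cmul = torch.ones_like(logits[0])
for out in logits:
    cmul = cmul * F.softmax(out, dim=-1)
geo = torch.pow(cmul, 1.0 / split_factor)

# compare the two reductions via their predicted classes
print(arith.argmax(dim=-1))
print(geo.argmax(dim=-1))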