def get_bn(config):
    if config.use_sync_bn:
        group_size = config.kwargs.group_size
        var_mode = config.kwargs.var_mode
        if group_size == 1:
            bn_group = None
        else:
            world_size, rank = link.get_world_size(), link.get_rank()
            assert world_size % group_size == 0
            bn_group = simple_group_split(world_size, rank, world_size // group_size)

        del config.kwargs['group_size']
        config.kwargs.group = bn_group
        config.kwargs.var_mode = (link.syncbnVarMode_t.L1
                                  if var_mode == 'L1'
                                  else link.syncbnVarMode_t.L2)

        def BNFunc(*args, **kwargs):
            return link.nn.SyncBatchNorm2d(*args, **kwargs, **config.kwargs)

        return BNFunc
    else:
        def BNFunc(*args, **kwargs):
            return torch.nn.BatchNorm2d(*args, **kwargs, **config.kwargs)

        return BNFunc
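# Usage sketch for get_bn (assumptions: `config` is an EasyDict whose fields
# mirror the attributes accessed above, and the plain-BN branch is taken; the
# SyncBN branch additionally requires linklink to be initialized first).
from easydict import EasyDict

bn_config = EasyDict({'use_sync_bn': False,
                      'kwargs': EasyDict({'eps': 1e-5, 'momentum': 0.1})})
BN = get_bn(bn_config)   # factory that behaves like a BatchNorm2d class
bn_layer = BN(64)        # == torch.nn.BatchNorm2d(64, eps=1e-5, momentum=0.1)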
def optim_entry(config):
    rank = link.get_rank()
    if config['type'] == 'FusedFP16SGD' and FusedFP16SGD is None:
        raise RuntimeError(
            'FusedFP16SGD is disabled due to linklink version, '
            'try using other optimizers')
    if config['type'] == 'FusedFP16SGD' and rank > 0:
        config['kwargs']['verbose'] = False
    return globals()[config['type']](**config['kwargs'])
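# Usage sketch for optim_entry (assumptions: the class named in config['type']
# is visible in this module's globals, e.g. `SGD = torch.optim.SGD`, and
# `model` is a hypothetical nn.Module defined elsewhere).
optim_config = {
    'type': 'SGD',
    'kwargs': {'params': model.parameters(), 'lr': 0.1,
               'momentum': 0.9, 'weight_decay': 1e-4},
}
optimizer = optim_entry(optim_config)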
def get_logger(name, level=logging.INFO):
    # note: `level` is accepted for API compatibility but not applied here
    global _logger_names
    logger = logging.getLogger(name)
    if name in _logger_names:
        return logger
    _logger_names.append(name)
    # non-zero ranks get a filter so only rank 0 emits log records
    if link.get_rank() > 0:
        logger.addFilter(RankFilter())
    return logger
def initialize():
    process_id = int(os.environ['SLURM_PROCID']) if 'SLURM_PROCID' in os.environ else 0
    num_gpu = torch.cuda.device_count()
    if num_gpu > 0:
        torch.cuda.set_device(process_id % num_gpu)
    link.initialize()
    rank = link.get_rank()
    world_size = link.get_world_size()
    return rank, world_size
def dist_init():
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    link.initialize()
    world_size = link.get_world_size()
    rank = link.get_rank()
    return rank, world_size
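# Launch sketch for dist_init (assumption: the processes were started with
# `srun`, so the SLURM_* variables read above are set; one process per GPU).
rank, world_size = dist_init()
print(f'rank {rank}/{world_size} bound to cuda:{torch.cuda.current_device()}')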
def _batch_unshuffle_ddp(self, x, idx_unshuffle):
    # gather from all gpus
    batch_size_this = x.shape[0]
    x_gather = concat_all_gather(x, self.group_size, self.group_idx)
    batch_size_all = x_gather.shape[0]
    num_gpus = batch_size_all // batch_size_this

    # restored index for this gpu
    gpu_idx = link.get_rank() % self.group_size
    idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx]
    return x_gather[idx_this]
def _get_group(bn_group_size):
    rank = get_rank()
    world_size = get_world_size()
    # normalize None to world_size *before* the cache lookup, so calling with
    # None repeatedly hits the cached communicator instead of rebuilding it
    if bn_group_size is None:
        bn_group_size = world_size
    if bn_group_size in GroupSyncBatchNorm.group_by_size:
        return GroupSyncBatchNorm.group_by_size[bn_group_size]
    assert world_size % bn_group_size == 0
    bn_group_comm = simple_group_split(world_size, rank, world_size // bn_group_size)
    GroupSyncBatchNorm.group_by_size[bn_group_size] = bn_group_comm
    return bn_group_comm
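# Cache-behaviour sketch (assumption: _get_group is a staticmethod of
# GroupSyncBatchNorm, as the class-level group_by_size cache suggests):
comm_a = GroupSyncBatchNorm._get_group(8)
comm_b = GroupSyncBatchNorm._get_group(8)
assert comm_a is comm_b  # second call reuses the cached communicator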
def __init__(self, encoder_q, encoder_k, K=65536, m=0.999, T=0.07,
             mlp=False, group_size=8):
    """
    K: queue size; number of negative keys (default: 65536)
    m: moco momentum of updating key encoder (default: 0.999)
    T: softmax temperature (default: 0.07)
    group_size: number of gpus per ShuffleBN group
        (default: 8; None shuffles data across all gpus)
    """
    super(MoCo, self).__init__()
    self.K = K
    self.m = m
    self.T = T

    dim = encoder_q.num_classes
    self.encoder_q = encoder_q
    self.encoder_k = encoder_k

    if mlp:  # hack: brute-force replacement of the fc head with a 2-layer MLP
        dim_mlp = self.encoder_q.fc.weight.shape[1]
        self.encoder_q.fc = nn.Sequential(
            nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.fc)
        self.encoder_k.fc = nn.Sequential(
            nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.fc)

    for param_q, param_k in zip(self.encoder_q.parameters(),
                                self.encoder_k.parameters()):
        param_k.data.copy_(param_q.data)  # initialize
        param_k.requires_grad = False     # not updated by gradient

    # create the queue
    self.register_buffer("queue", torch.randn(dim, K))
    self.queue = nn.functional.normalize(self.queue, dim=0)
    self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

    rank = link.get_rank()
    world_size = link.get_world_size()
    self.group_size = world_size if group_size is None else min(world_size, group_size)
    assert world_size % self.group_size == 0
    self.group_idx = simple_group_split(world_size, rank,
                                        world_size // self.group_size)
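# Construction sketch for MoCo (assumptions: `resnet50` is this repo's model
# entry exposing `num_classes` and an `fc` head; linklink is initialized and
# the world size is divisible by group_size):
encoder_q = resnet50(num_classes=128)
encoder_k = resnet50(num_classes=128)
moco = MoCo(encoder_q, encoder_k, K=65536, m=0.999, T=0.07,
            mlp=True, group_size=8)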
def dist_init(method='slurm', device_id=0):
    if method == 'slurm':
        proc_id = int(os.environ['SLURM_PROCID'])
        # ntasks = int(os.environ['SLURM_NTASKS'])
        # node_list = os.environ['SLURM_NODELIST']
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(proc_id % num_gpus)
    elif method == 'single_node':
        torch.cuda.set_device(device_id)

    link.initialize()
    world_size = link.get_world_size()
    rank = link.get_rank()
    return rank, world_size
def __init__(self, dataset, world_size=None, rank=None, round_up=True):
    if world_size is None:
        world_size = link.get_world_size()
    if rank is None:
        rank = link.get_rank()
    self.dataset = dataset
    self.world_size = world_size
    self.rank = rank
    self.round_up = round_up
    self.epoch = 0

    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.world_size))
    if self.round_up:
        self.total_size = self.num_samples * self.world_size
    else:
        self.total_size = len(self.dataset)
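# Usage sketch (assumption: this __init__ belongs to an epoch-style
# DistributedSampler with the usual __iter__/set_epoch interface; round_up=True
# pads the index list so every rank draws the same number of samples):
sampler = DistributedSampler(train_dataset, round_up=True)
loader = torch.utils.data.DataLoader(train_dataset, batch_size=64,
                                     sampler=sampler, num_workers=4)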
def _serialize_to_tensor(data, group=None):
    # backend = link.get_backend(group)
    # assert backend in ["gloo", "nccl"]
    # device = torch.device("cpu" if backend == "gloo" else "cuda")
    device = torch.cuda.current_device()

    buffer = pickle.dumps(data)
    if len(buffer) > 1024 ** 3:
        import logging
        logger = logging.getLogger('global')
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                link.get_rank(), len(buffer) / (1024 ** 3), device))
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
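# Round-trip sketch for _serialize_to_tensor; the decode side below is an
# assumption about how a matching all-gather helper would recover the object:
tensor = _serialize_to_tensor({'step': 100, 'top1': 76.3})
obj = pickle.loads(tensor.cpu().numpy().tobytes())  # -> {'step': 100, 'top1': 76.3}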
def _batch_shuffle_ddp(self, x):
    # gather from all gpus
    batch_size_this = x.shape[0]
    x_gather = concat_all_gather(x, self.group_size, self.group_idx)
    batch_size_all = x_gather.shape[0]
    num_gpus = batch_size_all // batch_size_this

    # random shuffle index
    idx_shuffle = torch.randperm(batch_size_all).cuda()

    # broadcast to all gpus
    link.broadcast(idx_shuffle, 0, group_idx=self.group_idx)

    # index for restoring
    idx_unshuffle = torch.argsort(idx_shuffle)

    # shuffled index for this gpu
    gpu_idx = link.get_rank() % self.group_size
    idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx]
    return x_gather[idx_this], idx_unshuffle
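# How the shuffle/unshuffle pair fits a MoCo-style key forward pass
# (a sketch; `im_k` is a hypothetical key-image batch on this GPU):
im_k, idx_unshuffle = self._batch_shuffle_ddp(im_k)  # mix samples across the group
k = self.encoder_k(im_k)                             # BN statistics see shuffled data
k = nn.functional.normalize(k, dim=1)
k = self._batch_unshuffle_ddp(k, idx_unshuffle)      # restore original sample order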
def setup_env(self):
    # dist
    self.dist = EasyDict()
    self.dist.rank, self.dist.world_size = link.get_rank(), link.get_world_size()
    self.prototype_info.world_size = self.dist.world_size

    # directories
    self.path = EasyDict()
    self.path.root_path = os.path.dirname(self.config_file)
    self.path.save_path = os.path.join(self.path.root_path, 'checkpoints')
    self.path.event_path = os.path.join(self.path.root_path, 'events')
    self.path.result_path = os.path.join(self.path.root_path, 'results')
    makedir(self.path.save_path)
    makedir(self.path.event_path)
    makedir(self.path.result_path)

    # tb_logger
    if self.dist.rank == 0:
        self.tb_logger = SummaryWriter(self.path.event_path)

    # logger
    create_logger(os.path.join(self.path.root_path, 'log.txt'))
    self.logger = get_logger(__name__)
    self.logger.info(f'config: {pprint.pformat(self.config)}')
    if 'SLURM_NODELIST' in os.environ:
        self.logger.info(f"hostnames: {os.environ['SLURM_NODELIST']}")

    # load pretrain checkpoint
    if hasattr(self.config.saver, 'pretrain'):
        self.state = torch.load(self.config.saver.pretrain.path, 'cpu')
        self.logger.info(
            f"Recovering from {self.config.saver.pretrain.path}, "
            f"keys={list(self.state.keys())}")
        if hasattr(self.config.saver.pretrain, 'ignore'):
            self.state = modify_state(self.state, self.config.saver.pretrain.ignore)
    else:
        self.state = {}
        self.state['last_iter'] = 0

    # others
    torch.backends.cudnn.benchmark = True
def __init__(self, dataset, total_iter, batch_size,
             world_size=None, rank=None, last_iter=0):
    if world_size is None:
        world_size = link.get_world_size()
    if rank is None:
        rank = link.get_rank()
    assert rank < world_size
    self.dataset = dataset
    self.total_iter = total_iter
    self.batch_size = batch_size
    self.world_size = world_size
    self.rank = rank
    self.last_iter = last_iter

    self.total_size = self.total_iter * self.batch_size
    self.indices = self.gen_new_list()
    self.call = 0
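# Usage sketch (assumptions: the enclosing class, called
# DistributedGivenIterationSampler here, is the iteration-based sampler whose
# gen_new_list() pre-computes total_iter * batch_size indices; `resume_step`
# is a hypothetical checkpointed iteration, so a resumed job skips the
# indices it has already consumed):
sampler = DistributedGivenIterationSampler(train_dataset, total_iter=300000,
                                           batch_size=64, last_iter=resume_step)
loader = torch.utils.data.DataLoader(train_dataset, batch_size=64,
                                     sampler=sampler, num_workers=4)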
def _setup_env(self):
    # distribution information
    self.dist = EasyDict()
    self.dist.rank, self.dist.world_size = link.get_rank(), link.get_world_size()

    # directories
    self.path = EasyDict()
    self.path.root_path = self.work_dir
    self.path.save_path = os.path.join(self.path.root_path, 'checkpoints')
    self.path.event_path = os.path.join(self.path.root_path, 'events')
    self.path.result_path = os.path.join(self.path.root_path, 'results')
    makedir(self.path.save_path)
    makedir(self.path.event_path)
    makedir(self.path.result_path)

    # create tensorboard logger
    if self.dist.rank == 0:
        self.tb_logger = SummaryWriter(self.path.event_path)

    # create logger
    create_logger(os.path.join(self.path.root_path, 'log.txt'))
    self.logger = get_logger(__name__)
    self.logger.info(f'config: {pprint.pformat(self.config)}')
    # guard: SLURM_NODELIST is only set when launched under SLURM
    if 'SLURM_NODELIST' in os.environ:
        self.logger.info(f"hostnames: {os.environ['SLURM_NODELIST']}")

    # others
    torch.backends.cudnn.benchmark = True
def send_info(info):
    PrototypeINFO = {"name": "Prototype", "version": __version__}
    PrototypeINFO.update(info)
    if link.get_rank() == 0:
        t = threading.Thread(target=send, args=(PrototypeINFO, ))
        t.start()
def validate(val_loader, model, fusion_list=None, fuse_prob=False):
    batch_time = AverageMeter(0)
    losses = AverageMeter(0)
    top1 = AverageMeter(0)
    top5 = AverageMeter(0)

    # switch to evaluate mode
    if fusion_list is not None:
        model_list = []
        for i in range(len(fusion_list)):
            model_list.append(model_entry(config.model))
            model_list[i].cuda()
            model_list[i] = DistModule(model_list[i], args.sync)
            load_state(fusion_list[i], model_list[i])
            model_list[i].eval()
        if fuse_prob:
            softmax = nn.Softmax(dim=1)
    else:
        model.eval()

    rank = link.get_rank()
    world_size = link.get_world_size()
    logger = logging.getLogger('global_logger')
    criterion = nn.CrossEntropyLoss()
    end = time.time()

    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input = input.cuda() if not args.fp16 else input.half().cuda()
            target = target.cuda()

            # compute output
            if fusion_list is not None:
                output_list = []
                for model_idx in range(len(fusion_list)):
                    output = model_list[model_idx](input)
                    if fuse_prob:
                        output = softmax(output)
                    output_list.append(output)
                output = torch.stack(output_list, 0)
                output = torch.mean(output, 0)
            else:
                output = model(input)

            # measure accuracy and record loss;
            # the loss must NOT be divided by world_size here, it is reduced later
            loss = criterion(output, target)
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            num = input.size(0)
            losses.update(loss.item(), num)
            top1.update(prec1.item(), num)
            top5.update(prec5.item(), num)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0 and rank == 0:
                logger.info(
                    'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(i, len(val_loader), batch_time=batch_time))

    # gather final results
    total_num = torch.Tensor([losses.count])
    loss_sum = torch.Tensor([losses.avg * losses.count])
    top1_sum = torch.Tensor([top1.avg * top1.count])
    top5_sum = torch.Tensor([top5.avg * top5.count])
    link.allreduce(total_num)
    link.allreduce(loss_sum)
    link.allreduce(top1_sum)
    link.allreduce(top5_sum)
    final_loss = loss_sum.item() / total_num.item()
    final_top1 = top1_sum.item() / total_num.item()
    final_top5 = top5_sum.item() / total_num.item()

    if rank == 0:
        logger.info(' * Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'
                    .format(final_top1, final_top5, final_loss, total_num.item()))

    model.train()
    return final_loss, final_top1, final_top5
def train(train_loader, val_loader, model, optimizer, lr_scheduler,
          start_iter, tb_logger):
    global best_prec1

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top5 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = link.get_world_size()
    rank = link.get_rank()
    logger = logging.getLogger('global_logger')
    end = time.time()

    label_smooth = config.get('label_smooth', 0.0)
    if label_smooth > 0:
        logger.info('using label_smooth: {}'.format(label_smooth))
        criterion = LabelSmoothCELoss(label_smooth, 1000)
    else:
        criterion = nn.CrossEntropyLoss()

    for i, (input, target) in enumerate(train_loader):
        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda() if not args.fp16 else input.cuda().half()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size

        link.allreduce(reduced_loss)
        link.allreduce(reduced_prec1)
        link.allreduce(reduced_prec5)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top5.update(reduced_prec5.item())

        # backward
        optimizer.zero_grad()
        if isinstance(optimizer, FusedFP16SGD):
            optimizer.backward(loss)
            reduce_gradients(model, args.sync)
            optimizer.step()
        elif isinstance(optimizer, FP16SGD):
            def closure():
                # backward
                optimizer.backward(loss, False)
                # sync gradients
                reduce_gradients(model, args.sync)
                # check overflow, convert to fp32 grads, downscale
                optimizer.update_master_grads()
                return loss
            optimizer.step(closure)
        else:
            loss.backward()
            reduce_gradients(model, args.sync)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc5_train', top5.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                        'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                        'LR {lr:.4f}'.format(curr_step, len(train_loader),
                                             batch_time=batch_time,
                                             data_time=data_time,
                                             loss=losses, top1=top1,
                                             top5=top5, lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            val_loss, prec1, prec5 = validate(val_loader, model)
            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, curr_step)
                tb_logger.add_scalar('acc1_val', prec1, curr_step)
                tb_logger.add_scalar('acc5_val', prec5, curr_step)
            if rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                save_checkpoint(
                    {
                        'step': curr_step,
                        'arch': config.model.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, config.save_path + '/ckpt')

        end = time.time()
def train(train_loader, val_loader, model, optimizer, lr_scheduler,
          start_iter, tb_logger):
    global best_prec1

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top5 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = link.get_world_size()
    rank = link.get_rank()
    logger = logging.getLogger('global_logger')
    end = time.time()

    label_smooth = config.get('label_smooth', 0.0)
    if label_smooth > 0:
        logger.info('using label_smooth: {}'.format(label_smooth))
        criterion = LabelSmoothCELoss(label_smooth, 1000)
    else:
        criterion = nn.CrossEntropyLoss()

    T_min, T_max = args.Tmin, args.Tmax

    def Log_UP(K_min, K_max, ITEMS, ALL_ITEMS):
        # log-linear interpolation from K_min to K_max over ALL_ITEMS steps
        Kmin = math.log(K_min) / math.log(10)
        Kmax = math.log(K_max) / math.log(10)
        return torch.tensor(
            [math.pow(10, Kmin + (Kmax - Kmin) / ALL_ITEMS * ITEMS)]).float().cuda()

    TIME = time.time()
    for i, (input, target) in enumerate(train_loader):
        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        if curr_step % config.print_freq == 0:
            t = Log_UP(T_min, T_max, curr_step, len(train_loader))
            if t < 1:
                k = 1 / t
            else:
                k = torch.tensor([1]).float().cuda()
            # push the new temperature (t) and scale (k) into every binarized
            # conv; the inner loop variable is `j` so it does not shadow the
            # outer batch index `i`
            layers = (model.module.layer1, model.module.layer2,
                      model.module.layer3, model.module.layer4)
            for layer, num_blocks in zip(layers, (3, 4, 6, 3)):
                for j in range(num_blocks):
                    layer[j].conv1.k = k
                    layer[j].conv2.k = k
                    layer[j].conv1.t = t
                    layer[j].conv2.t = t
            # print('current k {:.5e} current t {:.5e}'.format(k[0], t[0]))

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda() if not args.fp16 else input.cuda().half()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size

        link.allreduce(reduced_loss)
        link.allreduce(reduced_prec1)
        link.allreduce(reduced_prec5)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top5.update(reduced_prec5.item())

        # backward
        optimizer.zero_grad()
        if isinstance(optimizer, FusedFP16SGD):
            optimizer.backward(loss)
            reduce_gradients(model, args.sync)
            optimizer.step()
        elif isinstance(optimizer, FP16SGD):
            def closure():
                # backward
                optimizer.backward(loss, False)
                # sync gradients
                reduce_gradients(model, args.sync)
                # check overflow, convert to fp32 grads, downscale
                optimizer.update_master_grads()
                return loss
            optimizer.step(closure)
        else:
            loss.backward()
            reduce_gradients(model, args.sync)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc5_train', top5.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                        'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                        'LR {lr:.4f}'.format(curr_step, len(train_loader),
                                             batch_time=batch_time,
                                             data_time=data_time,
                                             loss=losses, top1=top1,
                                             top5=top5, lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            val_loss, prec1, prec5 = validate(val_loader, model)
            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, curr_step)
                tb_logger.add_scalar('acc1_val', prec1, curr_step)
                tb_logger.add_scalar('acc5_val', prec5, curr_step)
            if rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                # save_checkpoint({
                #     'step': curr_step,
                #     'arch': config.model.arch,
                #     'state_dict': model.state_dict(),
                #     'best_prec1': best_prec1,
                #     'optimizer': optimizer.state_dict(),
                # }, is_best, config.save_path + '/ckpt' + str(TIME % 100000))

        end = time.time()
def makedir(path):
    if link.get_rank() == 0 and not os.path.exists(path):
        os.makedirs(path)
    link.barrier()
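# Usage sketch: only rank 0 touches the filesystem, and the barrier keeps the
# other ranks from proceeding until the directory exists:
makedir(os.path.join(work_dir, 'checkpoints'))  # `work_dir` is hypothetical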