def __init__(self, args, ckp):
    super(Model, self).__init__()
    print('Making model...')

    self.scale = args.scale
    self.input_large = args.input_large
    self.precision = args.precision
    self.cpu = args.cpu
    self.device = torch.device('cpu' if args.cpu else 'cuda')
    self.n_GPUs = args.n_GPUs
    self.save_models = args.save_models

    # Dynamically import the module named by args.model and build the network.
    module = import_module(args.model.lower())
    self.model = module.make_model(args).to(self.device)

    # Model-specific weight initialization.
    if args.model == 'DCSRN':
        self.model.apply(dcsrn_weight_init)
    elif args.model == 'DCRSR':
        self.model.apply(weight_init)

    # For half precision, keep FP16 model parameters plus FP32 master copies.
    if args.precision == 'half':
        self.model.half()
        self.model_params, self.master_params = fp16.prep_param_lists(self.model)

    self.load(
        ckp.get_path('model'),
        pre_train=args.pre_train,
        resume=args.resume,
        cpu=args.cpu)
    print(self.model, file=ckp.log_file)
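# Illustrative only: a minimal sketch of the inputs this constructor expects,
# inferred from the attributes it reads above. The field values, the
# `SimpleCheckpoint` stand-in for `ckp`, and the commented-out call are
# assumptions for the sketch, not part of the original code base.
import argparse
import os
import sys

class SimpleCheckpoint:
    """Hypothetical stand-in for `ckp`: the constructor only uses
    `get_path('model')` and `log_file`."""
    def __init__(self, root='./experiment'):
        self.root = root
        self.log_file = sys.stdout

    def get_path(self, *subdirs):
        return os.path.join(self.root, *subdirs)

example_args = argparse.Namespace(
    model='DCSRN', scale=[2], input_large=False, precision='single',
    cpu=True, n_GPUs=1, save_models=False, pre_train='', resume=0)

# model = Model(example_args, SimpleCheckpoint())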
def initialize(self, model, inputs_module_destinations,
               configuration_maps, master_addr, rank,
               local_rank, num_ranks_in_server):
    self.send_ranks = {}
    self.receive_ranks = {}
    self.rank = rank
    self.local_rank = local_rank
    self.stage = None
    self.tensor_tags = {}
    self.forward_minibatch_id = 0
    self.backward_minibatch_id = 0
    self.criterion_input_name = str(model[-1][1][0])

    tensor_tag = 1
    for (_, input_tensors, output_tensors) in model:
        for input_tensor in input_tensors:
            if input_tensor not in self.tensor_tags:
                self.tensor_tags[input_tensor] = tensor_tag
                tensor_tag += 1
        for output_tensor in output_tensors:
            if output_tensor not in self.tensor_tags:
                self.tensor_tags[output_tensor] = tensor_tag
                tensor_tag += 1
    for target_tensor_name in sorted(self.target_tensor_names):
        self.tensor_tags[target_tensor_name] = tensor_tag
        tensor_tag += 1
    self.tensor_tags["ack"] = tensor_tag
    tensor_tag += 1

    module_to_stage_map = configuration_maps['module_to_stage_map']
    stage_to_rank_map = configuration_maps['stage_to_rank_map']
    stage_to_depth_map = configuration_maps['stage_to_depth_map']

    if module_to_stage_map is None:
        # If IP addresses not specified, resort to all layers on
        # a single machine.
        assert self.rank is None
        self.modules_with_dependencies = ModulesWithDependencies(model)
        self.is_criterion = True
        self.rank_in_stage = 0
        self.num_ranks = 1
        self.num_ranks_in_first_stage = 1
        self.num_ranks_in_previous_stage = 0
        self.num_ranks_in_next_stage = 0
        self.num_stages = 1
        self.num_ranks_in_stage = 1
        self.num_warmup_minibatches = 0
        self.comm_handler = None
    else:
        assert len(module_to_stage_map) == len(model)
        assert self.rank is not None

        stage_to_module_map = collections.defaultdict(list)
        for module in range(len(module_to_stage_map)):
            stage_to_module_map[module_to_stage_map[module]].append(module)

        rank_to_stage_map = {}
        for stage in stage_to_rank_map:
            for rank in stage_to_rank_map[stage]:
                rank_to_stage_map[rank] = stage

        # Now, use this mapping to determine the modules contained in
        # each stage.
        assert 0 <= self.rank < len(rank_to_stage_map)
        self.num_ranks = len(rank_to_stage_map)
        self.num_stages = len(stage_to_module_map)
        self.stage = rank_to_stage_map[self.rank]
        self.rank_in_stage = stage_to_rank_map[self.stage].index(self.rank)
        self.num_ranks_in_stage = len(stage_to_rank_map[self.stage])
        self.num_ranks_in_first_stage = len(stage_to_rank_map[0])

        self.num_ranks_in_previous_stage = 0
        self.ranks_in_previous_stage = []
        if self.stage > 0:
            self.num_ranks_in_previous_stage = len(
                stage_to_rank_map[self.stage - 1])
            self.ranks_in_previous_stage = stage_to_rank_map[self.stage - 1]

        self.num_ranks_in_next_stage = 0
        self.ranks_in_next_stage = []
        if self.stage < self.num_stages - 1:
            self.num_ranks_in_next_stage = len(
                stage_to_rank_map[self.stage + 1])
            self.ranks_in_next_stage = stage_to_rank_map[self.stage + 1]

        modules = stage_to_module_map[self.stage]
        self.modules_with_dependencies = ModulesWithDependencies(
            [model[module] for module in modules])
        self.is_criterion = self.stage == (self.num_stages - 1)

        if stage_to_depth_map is not None:
            self.num_warmup_minibatches = stage_to_depth_map[
                str(self.stage)]
        else:
            self.num_warmup_minibatches = self.num_ranks - 1
            for i in range(self.stage):
                self.num_warmup_minibatches -= len(
                    stage_to_rank_map[i])
            self.num_warmup_minibatches //= self.num_ranks_in_stage

        # To determine where tensors should be sent and received, first
        # determine the "producing" and "consuming" module IDs of each
        # tensor. We then use the corresponding machine ranks to send
        # and receive tensors.
        master_port = 12345
        self.comm_handler = communication.CommunicationHandler(
            master_addr=master_addr,
            master_port=master_port,
            rank=self.rank,
            local_rank=self.local_rank,
            num_ranks_in_server=num_ranks_in_server,
            world_size=self.num_ranks,
            fp16=self.fp16,
            backend=self.distributed_backend)

        for i in range(len(model)):
            for j in range(i + 1, len(model)):
                for tensor_name in model[i][2]:
                    if tensor_name in model[j][1]:
                        if module_to_stage_map[i] == \
                                module_to_stage_map[j]:
                            continue
                        # For now, assume that each stage is served by only
                        # a single machine.
                        if module_to_stage_map[j] == self.stage:
                            self.receive_ranks[tensor_name] = \
                                stage_to_rank_map[module_to_stage_map[i]]
                        if module_to_stage_map[i] == self.stage:
                            self.send_ranks[tensor_name] = \
                                stage_to_rank_map[module_to_stage_map[j]]

        for model_inputs in inputs_module_destinations.keys():
            destination_stage = module_to_stage_map[
                inputs_module_destinations[model_inputs]]
            if destination_stage > self.stage:
                self.send_ranks[model_inputs] = \
                    self.ranks_in_next_stage

            if 0 < self.stage <= destination_stage:
                self.receive_ranks[model_inputs] = \
                    self.ranks_in_previous_stage

            if destination_stage > 0:
                if model_inputs not in self.tensor_tags:
                    self.tensor_tags[model_inputs] = tensor_tag
                    tensor_tag += 1

    modules = self.modules_with_dependencies.modules()
    for i in range(len(modules)):
        modules[i] = modules[i].cuda()
        if self.fp16:
            import apex.fp16_utils as fp16_utils
            modules[i] = fp16_utils.BN_convert_float(modules[i].half())

    # Initialize all groups in the same order on every worker.
    if stage_to_rank_map is not None:
        groups = []
        for stage in range(self.num_stages):
            ranks = stage_to_rank_map[stage]
            if len(ranks) > 1:
                groups.append(dist.new_group(ranks=ranks))
            else:
                groups.append(None)
        group = groups[self.stage]
    else:
        group = None

    # self.modules_with_dependencies contains a list of PyTorch
    # modules, along with a list of user-defined input and output
    # tensor names. We use our module_executor.ModuleExecutor
    # class to wrap these dependencies, and use run_forward and
    # run_backward methods downstream.
    num_parameters = 0
    for i in range(len(modules)):
        if group is not None:
            if ((i < (len(modules) - 1) and self.is_criterion)
                    or not self.is_criterion):
                num_parameters += \
                    sum(x.size()[0] * x.size()[1]
                        if len(x.size()) > 1 else x.size()[0]
                        for x in modules[i].parameters() if x.size())
                modules[i] = torch.nn.parallel.DistributedDataParallel(
                    modules[i],
                    process_group=group,
                    device_ids=[local_rank],
                    output_device=local_rank)
    if self.num_ranks_in_stage > 1:
        module_size = 4. * num_parameters
        print("Replicating stage: ranks=%d, module_size=%.3f" % (
            self.num_ranks_in_stage, module_size))

    if self.fp16:
        self.master_parameters = []
        self.model_parameters = []
        for i in range(len(modules)):
            import apex.fp16_utils as fp16_utils
            module_parameters, module_master_parameters = \
                fp16_utils.prep_param_lists(modules[i])
            self.master_parameters.extend(module_master_parameters)
            self.model_parameters.extend(module_parameters)
    else:
        self.master_parameters = list(self.parameters())
        self.model_parameters = None

    if self.comm_handler is not None:
        self.comm_handler.initialize(
            self.receive_ranks,
            self.send_ranks,
            self.tensor_tags,
            self.target_tensor_names,
            self.training_tensor_dtypes,
            self.rank_in_stage,
            self.num_ranks_in_stage,
            self.ranks_in_previous_stage,
            self.ranks_in_next_stage)
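# Illustrative only: the shape of the `configuration_maps` dictionary that
# initialize() consumes, sketched for a hypothetical 4-module model split
# across 2 pipeline stages on 3 ranks (stage 0 replicated on ranks 0-1).
# The concrete values are assumptions for the sketch, not taken from any
# real configuration file.
example_configuration_maps = {
    # One entry per module in `model`; the value is the stage that owns it.
    'module_to_stage_map': [0, 0, 1, 1],
    # Stage index -> list of data-parallel ranks serving that stage.
    'stage_to_rank_map': {0: [0, 1], 1: [2]},
    # Optional: stage index *as a string* (see the str(self.stage) lookup
    # above) -> number of warm-up minibatches. May be None, in which case
    # the warm-up count is derived from the rank counts.
    'stage_to_depth_map': None,
}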
def benchmark_training(model, opts):
    """Benchmarks training phase.

    :param obj model: A model to benchmark
    :param dict opts: A dictionary of parameters.
    :rtype: tuple
    :return: A tuple of (model_name, list of batch times)
    """
    def _reduce_tensor(tensor):
        reduced = tensor.clone()
        dist.all_reduce(reduced, op=dist.reduce_op.SUM)
        reduced /= opts['world_size']
        return reduced

    if opts['phase'] != 'training':
        raise ValueError(
            "Phase in benchmark_training func is '%s'" % opts['phase'])

    opts['distributed'] = opts['world_size'] > 1
    opts['with_cuda'] = opts['device'] == 'gpu'
    opts['fp16'] = opts['dtype'] == 'float16'
    opts['loss_scale'] = 1

    if opts['fp16'] and not opts['with_cuda']:
        raise ValueError("Configuration error: FP16 can only be used with GPUs")

    if opts['with_cuda']:
        torch.cuda.set_device(opts['local_rank'])
        cudnn.benchmark = opts['cudnn_benchmark']
        cudnn.fastest = opts['cudnn_fastest']

    if opts['distributed']:
        dist.init_process_group(backend=opts['dist_backend'],
                                init_method='env://')

    if opts['with_cuda']:
        model = model.cuda()
    if opts['dtype'] == 'float16':
        model = network_to_half(model)
    if opts['distributed']:
        model = DDP(model, shared_param=True)

    if opts['fp16']:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    criterion = nn.CrossEntropyLoss()
    if opts['with_cuda']:
        criterion = criterion.cuda()
    optimizer = optim.SGD(master_params, lr=0.01,
                          momentum=0.9, weight_decay=1e-4)

    data_loader = DatasetFactory.get_data_loader(opts, opts['__input_shape'],
                                                 opts['__num_classes'])

    is_warmup = opts['num_warmup_batches'] > 0
    done = opts['num_warmup_batches'] == 0
    num_iterations_done = 0
    model.train()
    batch_times = np.zeros(opts['num_batches'])
    end_time = timeit.default_timer()
    while not done:
        prefetcher = DataPrefetcher(data_loader, opts)
        batch_data, batch_labels = prefetcher.next()
        while batch_data is not None:
            data_var = torch.autograd.Variable(batch_data)
            labels_var = torch.autograd.Variable(batch_labels)

            output = model(data_var)
            loss = criterion(output, labels_var)
            loss = loss * opts['loss_scale']
            # I'll need this for reporting:
            # reduced_loss = _reduce_tensor(loss.data) if opts['distributed'] else loss.data

            if opts['fp16']:
                model.zero_grad()
                loss.backward()
                model_grads_to_master_grads(model_params, master_params)
                if opts['loss_scale'] != 1:
                    for param in master_params:
                        param.grad.data = param.grad.data / opts['loss_scale']
                optimizer.step()
                master_params_to_model_params(model_params, master_params)
            else:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if opts['with_cuda']:
                torch.cuda.synchronize()

            # Track progress
            num_iterations_done += 1
            cur_time = timeit.default_timer()

            batch_data, batch_labels = prefetcher.next()

            if is_warmup:
                if num_iterations_done >= opts['num_warmup_batches']:
                    is_warmup = False
                    num_iterations_done = 0
            else:
                if opts['num_batches'] != 0:
                    batch_times[num_iterations_done - 1] = cur_time - end_time
                if num_iterations_done >= opts['num_batches']:
                    done = True
                    break
            end_time = cur_time

    return (opts['__name'], batch_times)
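# Illustrative only: a minimal `opts` dictionary covering every key that
# benchmark_training() reads, sketched for a single-GPU FP32 run. The
# concrete values and the model name are assumptions; the benchmark's own
# configuration layer would normally populate this dictionary.
example_opts = {
    'phase': 'training',
    'device': 'gpu', 'dtype': 'float32',
    'world_size': 1, 'local_rank': 0, 'dist_backend': 'nccl',
    'cudnn_benchmark': True, 'cudnn_fastest': False,
    'num_warmup_batches': 5, 'num_batches': 100,
    '__name': 'resnet50', '__input_shape': (3, 224, 224), '__num_classes': 1000,
}
# model_name, batch_times = benchmark_training(model, example_opts)
# print(model_name, batch_times.mean())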
import torch
from torch.nn.utils import parameters_to_vector
import apex.fp16_utils as fp16

def get_master(model, flat_master=False):
    "Return the model's parameters and the FP32 master copies the optimizer will update."
    model_params = [param for param in model.parameters() if param.requires_grad]
    if flat_master:
        # Flatten all master copies into a single FP32 parameter.
        master_param = parameters_to_vector([param.data.float() for param in model_params])
        master_param = torch.nn.Parameter(master_param)
        master_param.requires_grad = True
        if master_param.grad is None:
            master_param.grad = master_param.new(*master_param.size())
        return model_params, [master_param]
    else:
        # One FP32 master copy per model parameter.
        master_params = [param.clone().float().detach() for param in model_params]
        for param in master_params:
            param.requires_grad_(True)
        return model_params, master_params

# The util function from Apex to do this is `prep_param_lists`.
model_p, master_p = get_master(model)
model_p1, master_p1 = fp16.prep_param_lists(model)

def same_lists(ps1, ps2):
    assert len(ps1) == len(ps2)
    for (p1, p2) in zip(ps1, ps2):
        assert p1.requires_grad == p2.requires_grad
        assert torch.allclose(p1.data.float(), p2.data.float())

same_lists(model_p, model_p1)
same_lists(model_p, master_p)
same_lists(master_p, master_p1)
same_lists(model_p1, master_p1)

# We can't use flat_master when there is a mix of FP32 and FP16 parameters
# (like batchnorm here).
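# Illustrative only: a sketch of how the (model_params, master_params) pair is
# typically used in one loss-scaled training step. `loss_func`, `opt`, `xb`,
# `yb` and the default loss_scale are assumptions from the surrounding training
# loop; the copy helpers are the Apex ones (`fp16.model_grads_to_master_grads`
# and `fp16.master_params_to_model_params`).
def fp16_step(model, loss_func, opt, xb, yb, model_params, master_params, loss_scale=512.0):
    # Scale the loss so small FP16 gradients don't underflow to zero.
    loss = loss_func(model(xb), yb) * loss_scale
    model.zero_grad()
    loss.backward()  # gradients land on the FP16 model parameters
    # Copy FP16 gradients onto the FP32 master copies, then undo the scaling.
    fp16.model_grads_to_master_grads(model_params, master_params)
    for p in master_params:
        p.grad.div_(loss_scale)
    # The optimizer only ever updates the FP32 masters; copy the result back.
    opt.step()
    fp16.master_params_to_model_params(model_params, master_params)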