Example #1
    def __init__(self, args, ckp):
        super(Model, self).__init__()
        print('Making model...')

        self.scale = args.scale

        self.input_large = args.input_large

        self.precision = args.precision
        self.cpu = args.cpu
        self.device = torch.device('cpu' if args.cpu else 'cuda')
        self.n_GPUs = args.n_GPUs
        self.save_models = args.save_models

        # Dynamically import the module named by args.model and build the
        # network on the target device.
        module = import_module(args.model.lower())
        self.model = module.make_model(args).to(self.device)

        # Architecture-specific weight initialization.
        if args.model == 'DCSRN':
            self.model.apply(dcsrn_weight_init)
        elif args.model == 'DCRSR':
            self.model.apply(weight_init)
        # Half precision: cast the model to FP16 and keep an FP32 master
        # copy of the parameters for the optimizer.
        if args.precision == 'half':
            self.model.half()
            self.model_params, self.master_params = fp16.prep_param_lists(
                self.model)

        self.load(ckp.get_path('model'),
                  pre_train=args.pre_train,
                  resume=args.resume,
                  cpu=args.cpu)
        print(self.model, file=ckp.log_file)
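
The `model_params`/`master_params` pair returned by `fp16.prep_param_lists` only pays off if the optimizer is built over the FP32 master copy and gradients are copied between the two lists on every step; Example #3 below does exactly that inside its benchmark loop. A minimal standalone sketch, assuming `fp16` here refers to Apex's `fp16_utils`:

import torch
from apex import fp16_utils as fp16  # assumption: `fp16` above is Apex's fp16_utils

net = torch.nn.Linear(16, 4).cuda().half()
model_params, master_params = fp16.prep_param_lists(net)

# The optimizer works on the FP32 master copy, not on the FP16 model params.
optimizer = torch.optim.SGD(master_params, lr=1e-4)

x = torch.randn(8, 16, device='cuda', dtype=torch.half)
loss = net(x).float().sum()

net.zero_grad()
loss.backward()                                                   # FP16 gradients
fp16.model_grads_to_master_grads(model_params, master_params)     # copy grads to FP32
optimizer.step()                                                  # update in FP32
fp16.master_params_to_model_params(model_params, master_params)   # copy weights back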
Example #2
    def initialize(self, model, inputs_module_destinations,
                   configuration_maps, master_addr, rank,
                   local_rank, num_ranks_in_server):
        self.send_ranks = {}
        self.receive_ranks = {}
        self.rank = rank
        self.local_rank = local_rank
        self.stage = None
        self.tensor_tags = {}
        self.forward_minibatch_id = 0
        self.backward_minibatch_id = 0
        self.criterion_input_name = str(model[-1][1][0])

        # Assign a unique communication tag to every tensor name that appears
        # as a module input or output.
        tensor_tag = 1
        for (_, input_tensors, output_tensors) in model:
            for input_tensor in input_tensors:
                if input_tensor not in self.tensor_tags:
                    self.tensor_tags[input_tensor] = tensor_tag
                    tensor_tag += 1
            for output_tensor in output_tensors:
                if output_tensor not in self.tensor_tags:
                    self.tensor_tags[output_tensor] = tensor_tag
                    tensor_tag += 1
        for target_tensor_name in sorted(self.target_tensor_names):
            self.tensor_tags[target_tensor_name] = tensor_tag
            tensor_tag += 1
        self.tensor_tags["ack"] = tensor_tag
        tensor_tag += 1

        module_to_stage_map = configuration_maps['module_to_stage_map']
        stage_to_rank_map = configuration_maps['stage_to_rank_map']
        stage_to_depth_map = configuration_maps['stage_to_depth_map']

        if module_to_stage_map is None:
            # If no module-to-stage mapping is specified, run all layers on a
            # single machine.
            assert self.rank is None
            self.modules_with_dependencies = ModulesWithDependencies(model)
            self.is_criterion = True
            self.rank_in_stage = 0
            self.num_ranks = 1
            self.num_ranks_in_first_stage = 1
            self.num_ranks_in_previous_stage = 0
            self.num_ranks_in_next_stage = 0
            self.num_stages = 1
            self.num_ranks_in_stage = 1
            self.num_warmup_minibatches = 0
            self.comm_handler = None
        else:
            assert len(module_to_stage_map) == len(model)
            assert self.rank is not None

            stage_to_module_map = collections.defaultdict(list)
            for module in range(len(module_to_stage_map)):
                stage_to_module_map[module_to_stage_map[module]].append(module)

            rank_to_stage_map = {}
            for stage in stage_to_rank_map:
                for rank in stage_to_rank_map[stage]:
                    rank_to_stage_map[rank] = stage

            # Now, use this mapping to determine the modules contained in
            # each stage.
            assert 0 <= self.rank < len(rank_to_stage_map)
            self.num_ranks = len(rank_to_stage_map)
            self.num_stages = len(stage_to_module_map)
            self.stage = rank_to_stage_map[self.rank]
            self.rank_in_stage = stage_to_rank_map[self.stage].index(self.rank)
            self.num_ranks_in_stage = len(stage_to_rank_map[self.stage])
            self.num_ranks_in_first_stage = len(stage_to_rank_map[0])
            self.num_ranks_in_previous_stage = 0
            self.ranks_in_previous_stage = []
            if self.stage > 0:
                self.num_ranks_in_previous_stage = len(
                    stage_to_rank_map[self.stage - 1])
                self.ranks_in_previous_stage = stage_to_rank_map[self.stage - 1]
            self.num_ranks_in_next_stage = 0
            self.ranks_in_next_stage = []
            if self.stage < self.num_stages - 1:
                self.num_ranks_in_next_stage = len(
                    stage_to_rank_map[self.stage + 1])
                self.ranks_in_next_stage = stage_to_rank_map[self.stage + 1]
            modules = stage_to_module_map[self.stage]
            self.modules_with_dependencies = ModulesWithDependencies(
                [model[module] for module in modules])
            self.is_criterion = self.stage == (self.num_stages - 1)
            if stage_to_depth_map is not None:
                self.num_warmup_minibatches = stage_to_depth_map[
                    str(self.stage)]
            else:
                self.num_warmup_minibatches = self.num_ranks - 1
                for i in range(self.stage):
                    self.num_warmup_minibatches -= len(
                        stage_to_rank_map[i])
                self.num_warmup_minibatches = self.num_warmup_minibatches // \
                                              self.num_ranks_in_stage

            # To determine where tensors should be sent and received, first
            # determine the "producing" and "consuming" module IDs of each
            # tensor. We then use the corresponding machine ranks to send
            # and receive tensors.
            master_port = 12345
            self.comm_handler = communication.CommunicationHandler(
                master_addr=master_addr,
                master_port=master_port,
                rank=self.rank,
                local_rank=self.local_rank,
                num_ranks_in_server=num_ranks_in_server,
                world_size=self.num_ranks,
                fp16=self.fp16,
                backend=self.distributed_backend)

            for i in range(len(model)):
                for j in range(i + 1, len(model)):
                    for tensor_name in model[i][2]:
                        if tensor_name in model[j][1]:
                            if module_to_stage_map[i] == \
                                    module_to_stage_map[j]:
                                continue
                            # For now, assume that each stage is served by only
                            # a single machine.
                            if module_to_stage_map[j] == self.stage:
                                self.receive_ranks[tensor_name] = \
                                    stage_to_rank_map[module_to_stage_map[i]]
                            if module_to_stage_map[i] == self.stage:
                                self.send_ranks[tensor_name] = \
                                    stage_to_rank_map[module_to_stage_map[j]]

            for model_inputs in inputs_module_destinations.keys():
                destination_stage = module_to_stage_map[
                    inputs_module_destinations[model_inputs]]
                if destination_stage > self.stage:
                    self.send_ranks[model_inputs] = \
                        self.ranks_in_next_stage

                if 0 < self.stage <= destination_stage:
                    self.receive_ranks[model_inputs] = \
                        self.ranks_in_previous_stage

                if destination_stage > 0:
                    if model_inputs not in self.tensor_tags:
                        self.tensor_tags[model_inputs] = tensor_tag
                        tensor_tag += 1

        modules = self.modules_with_dependencies.modules()
        for i in range(len(modules)):
            modules[i] = modules[i].cuda()
            if self.fp16:
                import apex.fp16_utils as fp16_utils
                modules[i] = fp16_utils.BN_convert_float(modules[i].half())

        # Initialize all groups in the same order on every worker.
        if stage_to_rank_map is not None:
            groups = []
            for stage in range(self.num_stages):
                ranks = stage_to_rank_map[stage]
                if len(ranks) > 1:
                    groups.append(dist.new_group(ranks=ranks))
                else:
                    groups.append(None)
            group = groups[self.stage]
        else:
            group = None

        # self.modules_with_dependencies contains a list of PyTorch
        # modules, along with a list of user-defined input and output
        # tensor names. We use our module_executor.ModuleExecutor
        # class to wrap these dependencies, and use run_forward and
        # run_backward methods downstream.
        num_parameters = 0
        for i in range(len(modules)):
            if group is not None:
                if ((i < (len(modules) - 1) and self.is_criterion)
                        or not self.is_criterion):
                    num_parameters += \
                        sum(x.size()[0] * x.size()[1]
                            if len(x.size()) > 1 else x.size()[0]
                            for x in modules[i].parameters() if x.size())
                    modules[i] = torch.nn.parallel.DistributedDataParallel(
                        modules[i],
                        process_group=group,
                        device_ids=[local_rank],
                        output_device=local_rank)
        if self.num_ranks_in_stage > 1:
            module_size = 4. * num_parameters  # approximate size in bytes (FP32)
            print("Replicating stage: ranks=%d, module_size=%.3f" % (
                self.num_ranks_in_stage, module_size))

        if self.fp16:
            self.master_parameters = []
            self.model_parameters = []
            for i in range(len(modules)):
                import apex.fp16_utils as fp16_utils
                module_parameters, module_master_parameters = \
                    fp16_utils.prep_param_lists(modules[i])
                self.master_parameters.extend(module_master_parameters)
                self.model_parameters.extend(module_parameters)
        else:
            self.master_parameters = list(self.parameters())
            self.model_parameters = None

        if self.comm_handler is not None:
            self.comm_handler.initialize(
                self.receive_ranks,
                self.send_ranks,
                self.tensor_tags,
                self.target_tensor_names,
                self.training_tensor_dtypes,
                self.rank_in_stage,
                self.num_ranks_in_stage,
                self.ranks_in_previous_stage,
                self.ranks_in_next_stage)
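
When `stage_to_depth_map` is not supplied, the fallback branch above sizes each stage's warm-up by the ranks that remain at and beyond it, divided by the stage's own replication factor, so deeper stages keep fewer minibatches in flight. A small standalone sketch of that arithmetic under a hypothetical 3-stage configuration:

# Hypothetical pipeline: 3 stages replicated over 2 + 2 + 4 = 8 ranks.
stage_to_rank_map = {0: [0, 1], 1: [2, 3], 2: [4, 5, 6, 7]}
num_ranks = 8

def warmup_minibatches(stage):
    # Same arithmetic as the fallback branch above.
    n = num_ranks - 1
    for i in range(stage):
        n -= len(stage_to_rank_map[i])
    return n // len(stage_to_rank_map[stage])

print([warmup_minibatches(s) for s in range(3)])  # -> [3, 2, 0]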
Example #3
def benchmark_training(model, opts):
    """Benchmarks training phase.

    :param obj model: A model to benchmark.
    :param dict opts: A dictionary of parameters.
    :rtype: tuple
    :return: A tuple of (model_name, list of batch times).
    """
    def _reduce_tensor(tensor):
        reduced = tensor.clone()
        dist.all_reduce(reduced, op=dist.reduce_op.SUM)
        reduced /= opts['world_size']
        return reduced

    if opts['phase'] != 'training':
        raise ValueError(
            "Phase in benchmark_training func is '%s'" % opts['phase'])

    opts['distributed'] = opts['world_size'] > 1
    opts['with_cuda'] = opts['device'] == 'gpu'
    opts['fp16'] = opts['dtype'] == 'float16'
    opts['loss_scale'] = 1

    if opts['fp16'] and not opts['with_cuda']:
        raise ValueError(
            "Configuration error: FP16 can only be used with GPUs")

    if opts['with_cuda']:
        torch.cuda.set_device(opts['local_rank'])
        cudnn.benchmark = opts['cudnn_benchmark']
        cudnn.fastest = opts['cudnn_fastest']

    if opts['distributed']:
        dist.init_process_group(backend=opts['dist_backend'],
                                init_method='env://')

    if opts['with_cuda']:
        model = model.cuda()
        if opts['dtype'] == 'float16':
            model = network_to_half(model)

    if opts['distributed']:
        model = DDP(model, shared_param=True)

    if opts['fp16']:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    criterion = nn.CrossEntropyLoss()
    if opts['with_cuda']:
        criterion = criterion.cuda()
    optimizer = optim.SGD(master_params,
                          lr=0.01,
                          momentum=0.9,
                          weight_decay=1e-4)

    data_loader = DatasetFactory.get_data_loader(opts, opts['__input_shape'],
                                                 opts['__num_classes'])

    is_warmup = opts['num_warmup_batches'] > 0
    done = opts['num_warmup_batches'] == 0
    num_iterations_done = 0
    model.train()
    batch_times = np.zeros(opts['num_batches'])
    end_time = timeit.default_timer()
    while not done:
        prefetcher = DataPrefetcher(data_loader, opts)
        batch_data, batch_labels = prefetcher.next()
        while batch_data is not None:
            data_var = torch.autograd.Variable(batch_data)
            labels_var = torch.autograd.Variable(batch_labels)

            output = model(data_var)

            loss = criterion(output, labels_var)
            loss = loss * opts['loss_scale']
            # Reduced loss will be needed for reporting:
            # reduced_loss = _reduce_tensor(loss.data) if opts['distributed'] else loss.data

            if opts['fp16']:
                model.zero_grad()
                loss.backward()
                model_grads_to_master_grads(model_params, master_params)
                if opts['loss_scale'] != 1:
                    for param in master_params:
                        param.grad.data = param.grad.data / opts['loss_scale']
                optimizer.step()
                master_params_to_model_params(model_params, master_params)
            else:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if opts['with_cuda']:
                torch.cuda.synchronize()

            # Track progress
            num_iterations_done += 1
            cur_time = timeit.default_timer()

            batch_data, batch_labels = prefetcher.next()

            if is_warmup:
                if num_iterations_done >= opts['num_warmup_batches']:
                    is_warmup = False
                    num_iterations_done = 0
            else:
                if opts['num_batches'] != 0:
                    batch_times[num_iterations_done - 1] = cur_time - end_time
                if num_iterations_done >= opts['num_batches']:
                    done = True
                    break
            end_time = cur_time

    return (opts['__name'], batch_times)
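
`benchmark_training` reads quite a few keys from `opts`; below is a minimal sketch of a configuration that exercises the single-GPU FP32 path. All values are illustrative, not defaults from the original project:

opts = {
    'phase': 'training',
    'device': 'gpu',                  # 'gpu' enables the CUDA code paths
    'dtype': 'float32',               # 'float16' additionally requires a GPU
    'world_size': 1,                  # > 1 switches on distributed training
    'local_rank': 0,
    'dist_backend': 'nccl',
    'cudnn_benchmark': True,
    'cudnn_fastest': False,
    'num_warmup_batches': 5,          # iterations excluded from timing
    'num_batches': 100,               # timed iterations
    '__name': 'resnet50',             # hypothetical model name
    '__input_shape': (3, 224, 224),
    '__num_classes': 1000,
}
model_name, batch_times = benchmark_training(model, opts)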
Example #4
        if master_param.grad is None:
            master_param.grad = master_param.new(*master_param.size())
        return model_params, [master_param]
    else:
        master_params = [
            param.clone().float().detach() for param in model_params
        ]
        for param in master_params:
            param.requires_grad_(True)
        return model_params, master_params


# The util function from Apex to do this is `prep_param_lists`.

model_p, master_p = get_master(model)
model_p1, master_p1 = fp16.prep_param_lists(model)


def same_lists(ps1, ps2):
    assert len(ps1) == len(ps2)
    for (p1, p2) in zip(ps1, ps2):
        assert p1.requires_grad == p2.requires_grad
        assert torch.allclose(p1.data.float(), p2.data.float())


same_lists(model_p, model_p1)
same_lists(model_p, master_p)
same_lists(master_p, master_p1)
same_lists(model_p1, master_p1)

# We can't use flat_master when there is a mix of FP32 and FP16 parameters (like batchnorm here).
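
To see why: after converting a network to half precision while keeping batchnorm in FP32 (which is what `network_to_half` / `BN_convert_float` arrange), the parameter list mixes dtypes, so a single flattened master tensor of one dtype cannot represent it. A small standalone illustration in plain PyTorch, using a hypothetical toy model:

import torch.nn as nn

# Toy network: conv weights go to FP16, batchnorm parameters stay in FP32.
net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8)).half()
for m in net.modules():
    if isinstance(m, nn.BatchNorm2d):
        m.float()

print({name: p.dtype for name, p in net.named_parameters()})
# {'0.weight': torch.float16, '0.bias': torch.float16,
#  '1.weight': torch.float32, '1.bias': torch.float32}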