def build_eval_pipe(args):
    # Paths
    val_annotate = os.path.join(args.data, "annotations/bbox_only_instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    input_size = args.input_size
    val_trans = SSDTransformer((input_size, input_size), val=True)
    cocoGt = COCO(annotation_file=val_annotate, use_ext=True)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans, cocoGt.dataset)
    log_event(key=constants.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    val_dataloader   = DataLoader(val_coco,
                                  batch_size=args.eval_batch_size,
                                  shuffle=False, # Note: distributed sampler is shuffled :(
                                  sampler=val_sampler,
                                  num_workers=args.num_workers)

    inv_map = {v:k for k,v in val_coco.label_map.items()}

    return val_dataloader, inv_map, cocoGt
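
The inv_map returned above inverts the dataset's label_map so that the contiguous class indices used during training can be mapped back to the original COCO category ids expected by COCOeval. A minimal sketch of that inversion with a hypothetical label_map (the values are illustrative, not the real COCO mapping):

# Hypothetical label_map: original COCO category id -> contiguous class index
label_map = {1: 1, 2: 2, 3: 3, 16: 4}
inv_map = {v: k for k, v in label_map.items()}  # contiguous class index -> COCO id
assert inv_map[4] == 16                         # predictions are converted back before scoring
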
def mlperf_submission_log(benchmark):

    num_nodes = os.environ.get('SLURM_JOB_NUM_NODES', 1)

    configure_logger(benchmark)

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
        )

    log_event(
        key=constants.SUBMISSION_ORG,
        value='Fujitsu')

    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')

    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')

    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value='1xGX2570M5')
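
configure_logger and the log_event/log_start/log_end helpers used throughout these examples are not shown here. A minimal sketch of what they might wrap, assuming they are thin wrappers around the mlperf_logging.mllog API (an assumption, not this repository's implementation):

# Hypothetical sketch: thin wrappers over mlperf_logging.mllog.
from mlperf_logging import mllog

def configure_logger(benchmark):
    # Route the MLPerf-compliant log to a per-benchmark file (filename is illustrative).
    mllog.config(filename='{}.log'.format(benchmark))

def log_event(key, value=None, metadata=None, **kwargs):
    mllog.get_mllogger().event(key=key, value=value, metadata=metadata)
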
Example #3
    def __init__(self):

        self.sample_options = (
            # Do nothing
            None,
            # min IoU, max IoU
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            # no IoU requirements
            (None, None),
        )
        # The implementation uses 1 iteration to find a possible candidate; this
        # was shown to produce the same mAP as using more iterations.
        self.num_cropping_iterations = 1
        log_event(key=constants.MAX_SAMPLES,
                  value=self.num_cropping_iterations)
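
These sample_options drive the SSD random-crop augmentation: one option is drawn per image, where None means keep the image unchanged and (min_iou, None) means the sampled crop must overlap at least one ground-truth box with IoU >= min_iou. A minimal sketch of how an option might be consumed (an illustration, not this class's __call__):

import random

sample_options = (None, (0.1, None), (0.3, None), (0.5, None),
                  (0.7, None), (0.9, None), (None, None))
mode = random.choice(sample_options)
if mode is None:
    pass  # keep the original image
else:
    min_iou, max_iou = mode
    min_iou = float('-inf') if min_iou is None else min_iou
    max_iou = float('inf') if max_iou is None else max_iou
    # ...sample a candidate crop and accept it only if min_iou <= IoU <= max_iou
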
Example #4
def check_async_evals(args, evaluator, threshold):
    finished = 0
    # Note: only one rank does COCOEval, so we need to check there if we've
    # finished -- we'll broadcast that to a "finished" tensor to determine
    # if we should stop
    # Note2: ssd_print contains a barrier() call, implemented with all_reduce.
    #        If we condition on rank 0, an ssd_print all_reduce gets matched with
    #        the finished all_reduce and all hell breaks loose.
    if args.rank == 0:
        for epoch, current_accuracy in evaluator.finished_tasks().items():
            # TODO: move to a per-iteration check.
            # EVAL_START should precede the accuracy evaluation; the missing EVAL_START is added here for now.
            log_start(key=constants.EVAL_START,
                      metadata={'epoch_num': epoch + 1})
            log_event(key=constants.EVAL_ACCURACY,
                      value=current_accuracy,
                      metadata={'epoch_num': epoch + 1})
            log_end(key=constants.EVAL_STOP, metadata={'epoch_num': epoch + 1})
            if current_accuracy >= threshold:
                finished = 1

    # handle the non-distributed case -- don't need to bcast, just take local result
    if not args.distributed:
        return finished == 1

    # Now we know from all ranks if they're done - reduce result
    # Note: Already caught the non-distributed case above, can assume broadcast is available
    with torch.no_grad():
        finish_tensor = torch.tensor([finished],
                                     dtype=torch.int32,
                                     device=torch.device('cuda'))
        # torch.distributed.all_reduce(finish_tensor)
        torch.distributed.broadcast(finish_tensor, src=0)

        # at least one rank has seen the final accuracy
        if finish_tensor.item() >= 1:
            return True

    # Default case: no results yet, or none accurate enough
    return False
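
The same rank-0 flag exchange can be written in isolation. The sketch below assumes torch.distributed has already been initialised with a CUDA-capable backend such as NCCL, and is not tied to the evaluator above.

import torch
import torch.distributed as dist

def broadcast_finished(finished_locally):
    # Rank 0 decides; every other rank adopts its value via broadcast.
    flag = torch.tensor([1 if finished_locally else 0],
                        dtype=torch.int32, device='cuda')
    dist.broadcast(flag, src=0)
    return bool(flag.item())
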
Example #5
def build_pipeline(args, training=True, pipe=None):
    # Training and testing are handled differently because they produce
    # different outputs, but both paths go through this function to abstract
    # out the use of EncodingInputIterator and RateMatcher
    if training:
        builder_fn = build_dali_pipeline if args.dali else build_native_pipeline
        train_loader, epoch_size = builder_fn(args, training=True, pipe=pipe)
        log_event(key=constants.TRAIN_SAMPLES, value=epoch_size)

        train_loader = ConvertDaliInputIterator(train_loader)

        if args.fake_input:
            train_loader = FakeInputIterator(train_loader, epoch_size,
                                             args.N_gpu)

        if args.input_batch_multiplier > 1:
            train_loader = RateMatcher(input_it=train_loader,
                                       output_size=args.batch_size)

        return train_loader, epoch_size
    else:
        return build_native_pipeline(args, training=False)
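
The input_batch_multiplier path above relies on RateMatcher to re-split oversized loader batches back to the true batch size. A minimal, self-contained sketch of that rate-matching idea (an illustration, not the repository's RateMatcher):

def rematch(batches, output_size):
    # Yield fixed-size chunks from an iterator of oversized batches.
    for big_batch in batches:
        for i in range(0, len(big_batch), output_size):
            yield big_batch[i:i + output_size]

# Prints [[1, 2], [3, 4], [5, 6], [7, 8]]
print(list(rematch([[1, 2, 3, 4], [5, 6, 7, 8]], 2)))
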
Example #6
    def __init__(self, optimizer, start_warmup_steps, warmup_steps, total_steps, end_learning_rate=0.0, degree=1.0, last_epoch=-1):
        self.num_warmup_updates = warmup_steps
        self.start_warmup_steps = start_warmup_steps
        self.total_steps = total_steps
        self.end_learning_rate = end_learning_rate
        self.degree = degree
        super(LinearWarmupPolyDecayScheduler, self).__init__(optimizer, last_epoch)

        mlperf_logger.log_event(key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS, value=self.num_warmup_updates, sync=False)
        mlperf_logger.log_event(key='opt_lamb_learning_rate_decay_poly_power', value=degree, sync=False)
        mlperf_logger.log_event(key='start_warmup_step', value=self.start_warmup_steps, sync=False)
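
Only the constructor of LinearWarmupPolyDecayScheduler is shown; the schedule itself lives in get_lr. The sketch below shows a typical linear warmup followed by polynomial decay that is consistent with the parameters logged above, but the exact formula and edge handling are assumptions, not this repository's code.

def warmup_poly_lr(step, base_lr, start_warmup_steps, warmup_steps,
                   total_steps, end_lr=0.0, degree=1.0):
    # Linear warmup: ramp from 0 to base_lr over warmup_steps updates.
    if step < start_warmup_steps + warmup_steps:
        progress = max(step - start_warmup_steps, 0) / max(warmup_steps, 1)
        return base_lr * progress
    # Polynomial decay: fall from base_lr to end_lr by total_steps (degree=1 is linear).
    remaining = max(total_steps - step, 0) / max(
        total_steps - start_warmup_steps - warmup_steps, 1)
    return (base_lr - end_lr) * remaining ** degree + end_lr
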
Example #7
def make_criteo_data_and_loaders(args, offset_to_length_converter=False):
    if args.mlperf_logging and args.memory_map and args.data_set == "terabyte":
        # more efficient for larger batches
        data_directory = path.dirname(args.raw_data_file)

        if args.mlperf_bin_loader:
            lstr = args.processed_data_file.split("/")
            d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0]
            train_file = d_path + "_train.bin"
            test_file = d_path + "_test.bin"
            # val_file = d_path + "_val.bin"
            counts_file = args.raw_data_file + '_fea_count.npz'

            if any(not path.exists(p) for p in [train_file,
                                                test_file,
                                                counts_file]):
                ensure_dataset_preprocessed(args, d_path)

            train_data = data_loader_terabyte.CriteoBinDataset(
                data_file=train_file,
                counts_file=counts_file,
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range
            )

            mlperf_logger.log_event(key=mlperf_logger.constants.TRAIN_SAMPLES,
                                    value=train_data.num_samples)

            train_loader = torch.utils.data.DataLoader(
                train_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
                sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None
            )

            test_data = data_loader_terabyte.CriteoBinDataset(
                data_file=test_file,
                counts_file=counts_file,
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range
            )

            mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_SAMPLES,
                                    value=test_data.num_samples)

            test_loader = torch.utils.data.DataLoader(
                test_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
            )
        else:
            data_filename = args.raw_data_file.split("/")[-1]

            train_data = CriteoDataset(
                args.data_set,
                args.max_ind_range,
                args.data_sub_sample_rate,
                args.data_randomize,
                "train",
                args.raw_data_file,
                args.processed_data_file,
                args.memory_map,
                args.dataset_multiprocessing
            )

            test_data = CriteoDataset(
                args.data_set,
                args.max_ind_range,
                args.data_sub_sample_rate,
                args.data_randomize,
                "test",
                args.raw_data_file,
                args.processed_data_file,
                args.memory_map,
                args.dataset_multiprocessing
            )

            train_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=list(range(23)),
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="train"
            )

            test_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=[23],
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="test"
            )
    else:
        train_data = CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            "train",
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )

        test_data = CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            "test",
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )

        collate_wrapper_criteo = collate_wrapper_criteo_offset
        if offset_to_length_converter:
            collate_wrapper_criteo = collate_wrapper_criteo_length

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=args.mini_batch_size,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=args.test_mini_batch_size,
            shuffle=False,
            num_workers=args.test_num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

    return train_data, train_loader, test_data, test_loader
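
The offset_to_length_converter flag switches between two collate functions that differ only in how sparse features are represented: CSR-style offsets into a flat index tensor, or per-sample lengths. The snippet below illustrates the general offset-to-length conversion for a single sparse feature; the tensors are toy values, not Criteo data, and this is not the repository's collate code.

import torch

indices = torch.tensor([3, 7, 7, 1, 9])    # flattened sparse indices for 3 samples
offsets = torch.tensor([0, 2, 3])          # start of each sample within indices
boundaries = torch.cat([offsets, torch.tensor([indices.numel()])])
lengths = boundaries[1:] - boundaries[:-1]  # tensor([2, 1, 2])
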
Example #8
def train300_mlperf_coco(args):

    args = setup_distributed(args)

    # Build the model
    model_options = {
        'use_nhwc': args.nhwc,
        'pad_input': args.pad_input,
        'bn_group': args.bn_group,
    }

    ssd300 = SSD300(args, args.num_classes, **model_options)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    dboxes = dboxes300_coco()
    # Note: No reason not to use optimised loss
    loss_func = OptLoss()
    loss_func.cuda()

    # Create optimizer.  This must also be done after network_to_half.
    global_batch_size = (args.N_gpu * args.batch_size)
    log_event(key=constants.MODEL_BN_SPAN,
              value=args.bn_group * args.batch_size)
    log_event(key=constants.GLOBAL_BATCH_SIZE, value=global_batch_size)

    # MLPerf only allows base_lr to be scaled by an integer multiplier
    base_lr = 2.5e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = args.wd
    static_loss_scale = 128.

    optim = apex.optimizers.FusedSGD(ssd300.parameters(),
                                     lr=current_lr,
                                     momentum=current_momentum,
                                     weight_decay=current_weight_decay)

    ssd300, optim = apex.amp.initialize(ssd300,
                                        optim,
                                        opt_level='O2',
                                        loss_scale=static_loss_scale)

    # Parallelize.  Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     gradient_predivide_factor=args.N_gpu / 8.0,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    log_event(key=constants.OPT_BASE_LR, value=current_lr)
    log_event(key=constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_epochs)
    log_event(key=constants.OPT_LR_DECAY_STEPS, value=args.lr_decay_epochs)
    log_event(key=constants.OPT_WEIGHT_DECAY, value=current_weight_decay)
    if args.warmup is not None:
        log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup)
        log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)

    # Model is completely finished -- need to create separate copies, preserve parameters across
    # them, and jit
    ssd300_eval = SSD300(args, args.num_classes, **model_options).cuda()

    if args.use_fp16:
        convert_network(ssd300_eval, torch.half)

    # Get the existing state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300
    ssd300_eval.load_state_dict(train_model.state_dict())
    ssd300_eval.eval()

    print_message(args.local_rank, "epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0

    start_elapsed_time = time.time()
    last_printed_iter = args.iteration
    num_elapsed_samples = 0

    input_c = 4 if args.pad_input else 3
    example_shape = [args.batch_size, 300, 300, input_c
                     ] if args.nhwc else [args.batch_size, input_c, 300, 300]
    example_input = torch.randn(*example_shape).cuda()

    if args.use_fp16:
        example_input = example_input.half()
    if args.jit:
        # DDP has some Python-side control flow.  If we JIT the entire DDP-wrapped module,
        # the resulting ScriptModule will elide this control flow, resulting in allreduce
        # hooks not being called.  If we're running distributed, we need to extract and JIT
        # the wrapped .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks
        # to go out of scope, and therefore silently disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit,
                                            example_input,
                                            check_trace=False)
        else:
            ssd300 = torch.jit.trace(module_to_jit,
                                     example_input,
                                     check_trace=False)
        # JIT the eval model too
        ssd300_eval = torch.jit.trace(ssd300_eval,
                                      example_input,
                                      check_trace=False)

    # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here
    ploc, plabel = ssd300(example_input)

    # produce a single dummy "loss" to make things easier
    loss = ploc[0, 0, 0] + plabel[0, 0, 0]
    dloss = torch.randn_like(loss)
    # Cause cudnnFind for dgrad, wgrad to run
    loss.backward(dloss)

    # Necessary import in init
    from pycocotools.coco import COCO

    encoder = build_ssd300_coder()

    evaluator = AsyncEvaluator(num_threads=1)

    log_end(key=constants.INIT_STOP)

    ##### END INIT

    # This is the first place we touch anything related to data
    ##### START DATA TOUCHING
    barrier()
    log_start(key=constants.RUN_START)
    barrier()

    train_pipe = prebuild_pipeline(args)

    train_loader, epoch_size = build_pipeline(args,
                                              training=True,
                                              pipe=train_pipe)
    if args.rank == 0:
        print("epoch size is: ", epoch_size, " images")

    val_loader, inv_map, cocoGt = build_pipeline(args, training=False)
    if args.profile_gc_off:
        gc.disable()
        gc.collect()

    ##### END DATA TOUCHING
    i_eval = 0
    block_start_epoch = 1
    log_start(key=constants.BLOCK_START,
              metadata={
                  'first_epoch_num': block_start_epoch,
                  'epoch_count': args.evaluation[i_eval]
              })
    for epoch in range(args.epochs):
        for p in ssd300.parameters():
            p.grad = None

        if epoch in args.evaluation:
            # Get the existing state from the train model
            # * if we use distributed, then we want .module
            train_model = ssd300.module if args.distributed else ssd300

            if args.distributed and args.allreduce_running_stats:
                if args.rank == 0: print("averaging bn running means and vars")
                # make sure every node has the same running bn stats before
                # using them to evaluate, or saving the model for inference
                world_size = float(torch.distributed.get_world_size())
                for bn_name, bn_buf in train_model.named_buffers(recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        torch.distributed.all_reduce(bn_buf,
                                                     op=dist.ReduceOp.SUM)
                        bn_buf /= world_size

            if args.rank == 0:
                if args.save:
                    print("saving model...")
                    if not os.path.isdir('./models'):
                        os.mkdir('./models')
                    torch.save({"model": ssd300.state_dict()},
                               "./models/iter_{}.pt".format(iter_num))

            ssd300_eval.load_state_dict(train_model.state_dict())
            # Note: coco_eval no longer returns a value; evaluation is handled inside the evaluator
            coco_eval(args,
                      ssd300_eval,
                      val_loader,
                      cocoGt,
                      encoder,
                      inv_map,
                      epoch,
                      iter_num,
                      evaluator=evaluator)
            log_end(key=constants.BLOCK_STOP,
                    metadata={'first_epoch_num': block_start_epoch})
            if epoch != max(args.evaluation):
                i_eval += 1
                block_start_epoch = epoch + 1
                log_start(key=constants.BLOCK_START,
                          metadata={
                              'first_epoch_num':
                              block_start_epoch,
                              'epoch_count': (args.evaluation[i_eval] -
                                              args.evaluation[i_eval - 1])
                          })

        if epoch in args.lr_decay_epochs:
            current_lr *= args.lr_decay_factor
            print_message(
                args.rank,
                "lr decay step #" + str(bisect(args.lr_decay_epochs, epoch)))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        log_start(key=constants.EPOCH_START,
                  metadata={
                      'epoch_num': epoch + 1,
                      'current_iter_num': iter_num
                  })

        for i, (img, bbox, label) in enumerate(train_loader):

            if args.profile_start is not None and iter_num == args.profile_start:
                torch.cuda.profiler.start()
                torch.cuda.synchronize()
                if args.profile_nvtx:
                    torch.autograd._enable_profiler(
                        torch.autograd.ProfilerState.NVTX)

            if args.profile is not None and iter_num == args.profile:
                if args.profile_start is not None and iter_num >= args.profile_start:
                    # we turned cuda and nvtx profiling on, better turn it off too
                    if args.profile_nvtx:
                        torch.autograd._disable_profiler()
                    torch.cuda.profiler.stop()
                return

            if args.warmup is not None:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr,
                          args)

            if (img is None) or (bbox is None) or (label is None):
                print("No labels in batch")
                continue

            ploc, plabel = ssd300(img)
            ploc, plabel = ploc.float(), plabel.float()

            N = img.shape[0]
            bbox.requires_grad = False
            label.requires_grad = False
            # reshape (N*8732x4 -> Nx8732x4) and transpose (Nx8732x4 -> Nx4x8732)
            bbox = bbox.view(N, -1, 4).transpose(1, 2).contiguous()
            # reshape (N*8732 -> Nx8732) and cast to Long
            label = label.view(N, -1).long()
            loss = loss_func(ploc, plabel, bbox, label)

            if np.isfinite(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            else:
                print("model exploded (corrupted by Inf or Nan)")
                sys.exit()

            num_elapsed_samples += N
            if args.rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                avg_samples_per_sec = num_elapsed_samples * args.N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\
                            .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n")

                last_printed_iter = iter_num
                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            with apex.amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()

            if not args.profile_fake_optim:
                optim.step()

            # There is likely some skew across ranks here, so take the opportunity
            # to set the gradients to None.  After DALI integration, playing with
            # the placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            # Don't check every iteration due to cost of broadcast
            if iter_num % 20 == 0:
                finished = check_async_evals(args, evaluator, args.threshold)

                if finished:
                    return True

            iter_num += 1

        train_loader.reset()
        log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': epoch + 1})

    return False
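
MLPerf only permits the base learning rate to be scaled by an integer multiplier, which is what the adjusted_multiplier arithmetic near the top of this function enforces. A worked example with illustrative numbers:

base_lr = 2.5e-3
requested_lr = 2.5e-3                 # args.lr
global_batch_size = 1024              # args.N_gpu * args.batch_size
multiplier = max(1, round((requested_lr / base_lr) * global_batch_size / 32))
current_lr = base_lr * multiplier     # 2.5e-3 * 32 = 0.08
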
def main():
    args = parse_arguments()
    status = 'aborted'  # later set to 'success' if termination criteria met

    mlperf_logger.log_start(key=mlperf_logger.constants.INIT_START,
                            log_all_ranks=True, sync=False)

    if args.use_env and 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])
    
    device, args = setup_training(args)

    mlperf_logger.mlperf_submission_log('bert')

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.num_epochs_to_generate_seeds_for, device)
    worker_seed = worker_seeds[torch.distributed.get_rank()]
        
    random.seed(worker_seed)
    np.random.seed(worker_seed)
    torch.manual_seed(worker_seed)
    worker_init = WorkerInitObj(worker_seed)

    mlperf_logger.log_event(key=mlperf_logger.constants.SEED, value=args.seed,
                            sync=False)
    mlperf_logger.log_event(key=mlperf_logger.constants.GLOBAL_BATCH_SIZE,
                            value=global_batch_size(args), sync=False)
    mlperf_logger.log_event(key='opt_gradient_accumulation_steps',
                            value=args.gradient_accumulation_steps, sync=False)
    mlperf_logger.log_event(key='max_predictions_per_seq',
                            value=args.max_predictions_per_seq, sync=False)
    mlperf_logger.log_event(key='opt_learning_rate_training_steps',
                            value=args.max_steps, sync=False)
    mlperf_logger.log_event(key='num_warmup_steps',
                            value=int(args.warmup_proportion*args.max_steps) if args.warmup_steps==0 else args.warmup_steps,
                            sync=False)

    if utils.is_main_process():
        print("parsed args:")
        print(args)
    # Prepare optimizer
    model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer(args, device)
    samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu

    if args.unpad:
        torch.cuda.synchronize()
        InitMHACUDAExtension()
        torch.cuda.synchronize()

    final_loss = float("inf")
    train_time_raw = float("inf")
    raw_train_start = time.time()

    if args.do_train:
        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 1
        training_steps = 0
        end_training, converged = False, False
        samples_trained_prev = 0
        eval_count = 0

        pool = ProcessPoolExecutor(1)

        if args.target_mlm_accuracy:
            if args.train_mlm_accuracy_window_size > 0:
                accuracy_scores = []
                avg_mlm_accuracy = torch.Tensor([0]).cuda()
        

        first_epoch = True
        if found_resume_checkpoint(args):
            f_start_id = checkpoint['files'][0]
            files = checkpoint['files'][1:]
            num_files = len(files)
        else:
            files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                     os.path.isfile(os.path.join(args.input_dir, f)) and 'part' in f]
            files.sort()
            num_files = len(files)
            random.Random(shuffling_seeds[epoch]).shuffle(files)
            f_start_id = 0

        mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP, sync=False)
        mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START, sync=True)
        mlperf_logger.barrier()

        # Start prefetching eval dataset
        if args.eval_dir:
            eval_dataset_future = pool.submit(create_eval_dataset, args, worker_init_fn=worker_init)

        while global_step < args.max_steps and not end_training:
            mlperf_logger.log_start(key=mlperf_logger.constants.EPOCH_START,
                                    metadata={'epoch_num': epoch}, sync=False)
            mlperf_logger.log_start(key=mlperf_logger.constants.BLOCK_START,
                                    metadata={'first_epoch_num': epoch,
                                              'epoch_count': 1},
                                    sync=False)
            if utils.is_main_process():
                print("parsed args:")
                print(args)

                now_time = time.time()
                now_step = global_step
                now_skipped = skipped_steps

                print("epoch:", epoch)

            thread = None

            # Reshuffle file list on subsequent epochs
            if not first_epoch:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and 'part' in f]
                files.sort()
                num_files = len(files)
                random.Random(shuffling_seeds[epoch]).shuffle(files)
                f_start_id = 0

            first_epoch = False

            shared_file_list = {}

            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id*torch.distributed.get_world_size() + torch.distributed.get_rank() +
                                   remainder * f_start_id) % num_files]
            else:
                data_file = files[(f_start_id*torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size, num_workers=4, worker_init_fn=worker_init, pin_memory=True)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            for f_id in range(f_start_id + 1, len(files)):
                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id*torch.distributed.get_world_size() + torch.distributed.get_rank() +
                                       remainder * f_id) % num_files]
                else:
                    data_file = files[(f_id*torch.distributed.get_world_size() + torch.distributed.get_rank())%num_files]

                previous_file = data_file

                dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init_fn=worker_init)

                for step, batch in enumerate(train_dataloader):
                    training_steps += 1
                    update_step = training_steps % args.gradient_accumulation_steps == 0

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss, mlm_acc, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                    masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels,
                                    checkpoint_activations=args.checkpoint_activations)

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    if update_step:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
                        samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu

                        if (args.eval_dir and args.eval_iter_samples > 0 and
                            samples_trained >= args.eval_iter_start_samples + eval_count * args.eval_iter_samples):

                            # on first eval, get eval_dataloader
                            if eval_count == 0:
                               eval_dataloader = eval_dataset_future.result(timeout=None)

                            samples_trained_prev = samples_trained
                            eval_avg_loss, eval_avg_mlm_accuracy = run_eval(model, eval_dataloader, device, args.num_eval_examples,
                                                                            first_eval=(eval_count == 0), use_cache=args.cache_eval_data)
                            if utils.is_main_process():
                                mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_ACCURACY, value=eval_avg_mlm_accuracy, metadata={'epoch_num': epoch}, sync=False)
                                print({"global_steps": global_step, "eval_loss": eval_avg_loss, "eval_mlm_accuracy":eval_avg_mlm_accuracy})

                            if args.target_mlm_accuracy:
                                if eval_avg_mlm_accuracy >= args.target_mlm_accuracy:
                                    end_training, converged = True, True
                                    if utils.is_main_process():
                                        print("%f > %f, Target MLM Accuracy reached at %d"%(eval_avg_mlm_accuracy, args.target_mlm_accuracy, global_step))

                            eval_count += 1

                    if args.target_mlm_accuracy and args.train_mlm_accuracy_window_size > 0:
                        accuracy_scores.append(mlm_acc)
                        if update_step:
                            accuracy_scores = accuracy_scores[-args.train_mlm_accuracy_window_size * args.gradient_accumulation_steps:]
                            avg_mlm_accuracy[0] = sum(accuracy_scores) / len(accuracy_scores)
                            torch.distributed.all_reduce(avg_mlm_accuracy, op=torch.distributed.ReduceOp.SUM)
                            avg_mlm_accuracy /= torch.distributed.get_world_size()

                    if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu
                        if utils.is_main_process():
                            time_interval = time.time() - now_time
                            step_interval = global_step - now_step
                            skip_interval = skipped_steps - now_skipped
                            now_time = time.time()
                            now_step = global_step
                            now_skipped = skipped_steps
                            training_perf = args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu \
                                            * (step_interval + skip_interval) / time_interval
                            
                            if args.train_mlm_accuracy_window_size > 0:
                                print({"training_steps": training_steps,
                                      "average_loss": average_loss / (args.log_freq * divisor),
                                      "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                      "learning_rate": optimizer.param_groups[0]['lr'],
                                      "seq/s": training_perf,
                                      "global_steps": now_step,
                                      "samples_trained": samples_trained,
                                      "skipped_steps": now_skipped,
                                      "timestamp": now_time,
                                      "mlm_accuracy": avg_mlm_accuracy[0].item()})
                            else:
                                print({"training_steps": training_steps,
                                      "average_loss": average_loss / (args.log_freq * divisor),
                                      "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                      "learning_rate": optimizer.param_groups[0]['lr'],
                                      "seq/s": training_perf,
                                      "global_steps": now_step,
                                      "samples_trained": samples_trained,
                                      "skipped_steps": now_skipped,
                                      "timestamp": now_time})

                        average_loss = 0

                    if global_step >= args.max_steps or end_training:
                        status = 'success' if converged else 'aborted'
                        end_training = True
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if (torch.distributed.is_initialized()):
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = average_loss.item()
                        if utils.is_main_process():
                            if args.train_mlm_accuracy_window_size > 0:
                                print((epoch, training_steps / args.gradient_accumulation_steps, ), {"final_loss": final_loss,
                                    "final_mlm_accuracy": avg_mlm_accuracy[0].item()})
                            else:
                                print((epoch, training_steps / args.gradient_accumulation_steps, ), {"final_loss": final_loss})

                    if end_training or (samples_trained - samples_trained_prev >= args.num_samples_per_checkpoint and samples_trained >= args.min_samples_to_start_checkpoints):
                        samples_trained_prev = samples_trained
                        if utils.is_main_process() and not args.skip_checkpoint:
                            # Save a trained model
                            # Only save the model itself
                            model_to_save = model.module if hasattr(model, 'module') else model
                            if args.phase2:
                                output_save_file = os.path.join(args.output_dir, "phase2_ckpt_{}.pt".format(samples_trained))
                            else:
                                output_save_file = os.path.join(args.output_dir, "phase1_ckpt_{}.pt".format(samples_trained))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > args.keep_n_most_recent_checkpoints:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)
                            
                        if samples_trained >= args.max_samples_termination or end_training:
                            status = 'success' if converged else 'aborted' 
                            end_training = True
                            break

                del train_dataloader

                if samples_trained >= args.max_samples_termination or end_training:
                    status = 'success' if converged else 'aborted'
                    end_training = True
                    break

                train_dataloader, data_file = dataset_future.result(timeout=None)

            mlperf_logger.log_end(key=mlperf_logger.constants.BLOCK_STOP,
                                  metadata={'first_epoch_num': epoch},
                                  sync=False)
            mlperf_logger.log_end(key=mlperf_logger.constants.EPOCH_STOP,
                                  metadata={'epoch_num': epoch}, sync=False)
            epoch += 1
        
        mlperf_logger.log_event(key=mlperf_logger.constants.TRAIN_SAMPLES,
                                value=samples_trained,
                                sync=False)
        mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_SAMPLES,
                                value=args.num_eval_examples,
                                sync=False)
        mlperf_logger.log_end(key=mlperf_logger.constants.RUN_STOP,
                              metadata={'status': status}, sync=False)

    return args, final_loss, train_time_raw
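
The update_step logic above performs one optimizer step per gradient_accumulation_steps micro-batches, with gradients accumulating in between. A minimal standalone sketch of that pattern with a generic model and optimizer (not the repository's fused LAMB / AMP path):

import torch

accum_steps = 4
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

for step in range(1, 17):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()                   # gradients accumulate across micro-batches
    if step % accum_steps == 0:       # one optimizer update per accum_steps micro-batches
        optimizer.step()
        optimizer.zero_grad()
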
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None

    config = BertConfig.from_json_file(args.bert_config_path)
    config.fused_mha = args.fused_mha
    config.fused_gelu_bias = args.fused_gelu_bias
    config.dense_seq_output = args.dense_seq_output
    config.unpad = args.unpad
    config.pad = args.pad
    config.fuse_qkv = not args.disable_fuse_qkv
    config.fuse_scale = not args.disable_fuse_scale
    config.fuse_mask = not args.disable_fuse_mask
    config.fuse_dropout = args.enable_fuse_dropout
    config.apex_softmax = not args.disable_apex_softmax
    config.enable_stream = args.enable_stream
    if config.fuse_mask: config.apex_softmax = True
    if not config.pad: config.enable_stream = True
    if config.unpad: config.fused_mha = False

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Load from a PyTorch checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    if args.init_checkpoint is not None or found_resume_checkpoint(args):
        # Prepare model

        model = BertForPreTraining(config)
        if args.init_checkpoint is None: # finding checkpoint in output_dir
            checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
            model_names = glob.glob(os.path.join(args.output_dir, checkpoint_str))
            global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
            args.resume_step = global_step  # used for throughput computation

            resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
            print("Setting init checkpoint to %s - which is the latest in %s" %(resume_init_checkpoint, args.output_dir))
            checkpoint=torch.load(resume_init_checkpoint, map_location="cpu")
        else:
            checkpoint=torch.load(args.init_checkpoint, map_location="cpu")["model"]

        # Fused MHA requires a remapping of checkpoint parameters
        if config.fused_mha:
            checkpoint_remapped = remap_attn_parameters(checkpoint)
            model.load_state_dict(checkpoint_remapped, strict=False)
        else:
            model.load_state_dict(checkpoint, strict=True)
    else:  # Load from TF checkpoint
        model = BertForPreTraining.from_pretrained(args.init_tf_checkpoint, from_tf=True, config=config)


    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    mlperf_logger.log_event(key=mlperf_logger.constants.OPT_BASE_LR,
                            value=args.learning_rate, sync=False)
    optimizer = FusedLAMB(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2))
    mlperf_logger.log_event(key='opt_epsilon', value=optimizer.defaults['eps'],
                            sync=False)
    b1, b2 = optimizer.defaults['betas']
    mlperf_logger.log_event(key='opt_lamb_beta_1', value=b1, sync=False)
    mlperf_logger.log_event(key='opt_lamb_beta_2', value=b2, sync=False)
    mlperf_logger.log_event(key='opt_lamb_weight_decay_rate',
                            value=optimizer.defaults['weight_decay'],
                            sync=False)

    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step
    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer, start_warmup_steps=warmup_start, warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps, end_learning_rate=0.0, degree=1.0)
    
                           
    if args.fp16:

        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale)
        amp._amp_state.loss_scalers[0]._loss_scale = float(os.getenv("INIT_LOSS_SCALE", 2**20))


    if found_resume_checkpoint(args):
        optimizer.load_state_dict(checkpoint['optimizer'])  # restores m/v states (only when resuming a checkpoint, not for init_checkpoint or init_tf_checkpoint for now)

        # Restore AMP master parameters          
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model, message_size=250000000, gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )

    return model, optimizer, lr_scheduler, checkpoint, global_step
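
The vocabulary padding above keeps the embedding table size a multiple of 8 so that tensor cores can be used efficiently. A worked example with BERT's uncased WordPiece vocabulary:

vocab_size = 30522                      # BERT uncased WordPiece vocabulary
if vocab_size % 8 != 0:
    vocab_size += 8 - (vocab_size % 8)  # 30522 -> 30528
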
Example #11
    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
        self.warmup = warmup
        self.total_steps = total_steps
        super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)

        mlperf_logger.log_event(key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS, value=total_steps*warmup, sync=False)