def build_eval_pipe(args):
    # Paths
    val_annotate = os.path.join(args.data, "annotations/bbox_only_instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    input_size = args.input_size
    val_trans = SSDTransformer((input_size, input_size), val=True)
    cocoGt = COCO(annotation_file=val_annotate, use_ext=True)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans, cocoGt.dataset)
    log_event(key=constants.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    val_dataloader = DataLoader(val_coco,
                                batch_size=args.eval_batch_size,
                                shuffle=False,  # Note: distributed sampler is shuffled :(
                                sampler=val_sampler,
                                num_workers=args.num_workers)

    inv_map = {v: k for k, v in val_coco.label_map.items()}

    return val_dataloader, inv_map, cocoGt
def mlperf_submission_log(benchmark):
    num_nodes = os.environ.get('SLURM_JOB_NUM_NODES', 1)

    configure_logger(benchmark)

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark)

    log_event(
        key=constants.SUBMISSION_ORG,
        value='Fujitsu')

    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')

    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')

    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value='1xGX2570M5')
def __init__(self):
    self.sample_options = (
        # Do nothing
        None,
        # min IoU, max IoU
        (0.1, None),
        (0.3, None),
        (0.5, None),
        (0.7, None),
        (0.9, None),
        # no IoU requirements
        (None, None),
    )
    # Implementation uses 1 iteration to find a possible candidate, this
    # was shown to produce the same mAP as using more iterations.
    self.num_cropping_iterations = 1
    log_event(key=constants.MAX_SAMPLES,
              value=self.num_cropping_iterations)
def check_async_evals(args, evaluator, threshold):
    finished = 0
    # Note: only one rank does COCOEval, so we need to check there if we've
    # finished -- we'll broadcast that to a "finished" tensor to determine
    # if we should stop
    # Note2: ssd_print contains a barrier() call, implemented with all_reduce.
    # If we condition on rank 0, then an ssd_print all_reduce matches with
    # the finished all_reduce and all hell breaks loose.
    if args.rank == 0:
        for epoch, current_accuracy in evaluator.finished_tasks().items():
            # Note: Move to per-iter check
            # EVAL_START should be prior to the accuracy/score evaluation but
            # adding the missing EVAL_START here for now
            log_start(key=constants.EVAL_START,
                      metadata={'epoch_num': epoch + 1})
            log_event(key=constants.EVAL_ACCURACY,
                      value=current_accuracy,
                      metadata={'epoch_num': epoch + 1})
            log_end(key=constants.EVAL_STOP,
                    metadata={'epoch_num': epoch + 1})
            if current_accuracy >= threshold:
                finished = 1

    # handle the non-distributed case -- don't need to bcast, just take local result
    if not args.distributed:
        return finished == 1

    # Now we know from all ranks if they're done - reduce result
    # Note: Already caught the non-distributed case above, can assume broadcast
    # is available
    with torch.no_grad():
        finish_tensor = torch.tensor([finished], dtype=torch.int32,
                                     device=torch.device('cuda'))
        # torch.distributed.all_reduce(finish_tensor)
        torch.distributed.broadcast(finish_tensor, src=0)

        # >= 1 rank has seen final accuracy
        if finish_tensor.item() >= 1:
            return True

    # Default case: no results, or no accurate enough results
    return False
def build_pipeline(args, training=True, pipe=None):
    # Handle training / testing differently due to different
    # outputs. But still want to do this to abstract out the
    # use of EncodingInputIterator and RateMatcher
    if training:
        builder_fn = build_dali_pipeline if args.dali else build_native_pipeline
        train_loader, epoch_size = builder_fn(args, training=True, pipe=pipe)
        log_event(key=constants.TRAIN_SAMPLES, value=epoch_size)
        train_loader = ConvertDaliInputIterator(train_loader)
        if args.fake_input:
            train_loader = FakeInputIterator(train_loader, epoch_size, args.N_gpu)
        if args.input_batch_multiplier > 1:
            train_loader = RateMatcher(input_it=train_loader, output_size=args.batch_size)
        return train_loader, epoch_size
    else:
        return build_native_pipeline(args, training=False)
def __init__(self, optimizer, start_warmup_steps, warmup_steps, total_steps,
             end_learning_rate=0.0, degree=1.0, last_epoch=-1):
    self.num_warmup_updates = warmup_steps
    self.start_warmup_steps = start_warmup_steps
    self.total_steps = total_steps
    self.end_learning_rate = end_learning_rate
    self.degree = degree
    super(LinearWarmupPolyDecayScheduler, self).__init__(optimizer, last_epoch)

    mlperf_logger.log_event(key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS,
                            value=self.num_warmup_updates, sync=False)
    mlperf_logger.log_event(key='opt_lamb_learning_rate_decay_poly_power',
                            value=degree, sync=False)
    mlperf_logger.log_event(key='start_warmup_step',
                            value=self.start_warmup_steps, sync=False)
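# Illustrative sketch only (not this repository's get_lr() implementation): the __init__
# above configures a linear warmup from 0 to the optimizer's base LR over `warmup_steps`
# (starting at `start_warmup_steps`), followed by a polynomial decay of power `degree`
# down to `end_learning_rate` at `total_steps`. The exact boundary handling in the real
# scheduler may differ; this helper only shows the shape of the schedule.
def _example_warmup_poly_decay_lr(step, base_lr, start_warmup_steps, warmup_steps,
                                  total_steps, end_learning_rate=0.0, degree=1.0):
    if step < start_warmup_steps + warmup_steps:
        # linear warmup: 0 -> base_lr
        progress = max(step - start_warmup_steps, 0) / max(warmup_steps, 1)
        return base_lr * progress
    # polynomial decay: base_lr -> end_learning_rate at total_steps
    decay_steps = max(total_steps - start_warmup_steps - warmup_steps, 1)
    progress = min((step - start_warmup_steps - warmup_steps) / decay_steps, 1.0)
    return (base_lr - end_learning_rate) * (1.0 - progress) ** degree + end_learning_rate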
def make_criteo_data_and_loaders(args, offset_to_length_converter=False):
    if args.mlperf_logging and args.memory_map and args.data_set == "terabyte":
        # more efficient for larger batches
        data_directory = path.dirname(args.raw_data_file)

        if args.mlperf_bin_loader:
            lstr = args.processed_data_file.split("/")
            d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0]
            train_file = d_path + "_train.bin"
            test_file = d_path + "_test.bin"
            # val_file = d_path + "_val.bin"
            counts_file = args.raw_data_file + '_fea_count.npz'

            if any(not path.exists(p) for p in [train_file, test_file, counts_file]):
                ensure_dataset_preprocessed(args, d_path)

            train_data = data_loader_terabyte.CriteoBinDataset(
                data_file=train_file,
                counts_file=counts_file,
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range
            )

            mlperf_logger.log_event(key=mlperf_logger.constants.TRAIN_SAMPLES,
                                    value=train_data.num_samples)

            train_loader = torch.utils.data.DataLoader(
                train_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
                sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None
            )

            test_data = data_loader_terabyte.CriteoBinDataset(
                data_file=test_file,
                counts_file=counts_file,
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range
            )

            mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_SAMPLES,
                                    value=test_data.num_samples)

            test_loader = torch.utils.data.DataLoader(
                test_data,
                batch_size=None,
                batch_sampler=None,
                shuffle=False,
                num_workers=0,
                collate_fn=None,
                pin_memory=False,
                drop_last=False,
            )
        else:
            data_filename = args.raw_data_file.split("/")[-1]

            train_data = CriteoDataset(
                args.data_set,
                args.max_ind_range,
                args.data_sub_sample_rate,
                args.data_randomize,
                "train",
                args.raw_data_file,
                args.processed_data_file,
                args.memory_map,
                args.dataset_multiprocessing
            )

            test_data = CriteoDataset(
                args.data_set,
                args.max_ind_range,
                args.data_sub_sample_rate,
                args.data_randomize,
                "test",
                args.raw_data_file,
                args.processed_data_file,
                args.memory_map,
                args.dataset_multiprocessing
            )

            train_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=list(range(23)),
                batch_size=args.mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="train"
            )

            test_loader = data_loader_terabyte.DataLoader(
                data_directory=data_directory,
                data_filename=data_filename,
                days=[23],
                batch_size=args.test_mini_batch_size,
                max_ind_range=args.max_ind_range,
                split="test"
            )
    else:
        train_data = CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            "train",
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )

        test_data = CriteoDataset(
            args.data_set,
            args.max_ind_range,
            args.data_sub_sample_rate,
            args.data_randomize,
            "test",
            args.raw_data_file,
            args.processed_data_file,
            args.memory_map,
            args.dataset_multiprocessing,
        )

        collate_wrapper_criteo = collate_wrapper_criteo_offset
        if offset_to_length_converter:
            collate_wrapper_criteo = collate_wrapper_criteo_length

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=args.mini_batch_size,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=args.test_mini_batch_size,
            shuffle=False,
            num_workers=args.test_num_workers,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

    return train_data, train_loader, test_data, test_loader
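# Illustrative usage note (hypothetical argparse values, not a prescribed configuration):
# the fast binary Terabyte path above is taken only when args.mlperf_logging,
# args.memory_map and args.mlperf_bin_loader are all set and args.data_set == "terabyte";
# otherwise the generic CriteoDataset path is used. For example:
#
#     args.data_set = "terabyte"
#     args.mlperf_logging = args.memory_map = args.mlperf_bin_loader = True
#     train_data, train_loader, test_data, test_loader = make_criteo_data_and_loaders(args)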
def train300_mlperf_coco(args):
    args = setup_distributed(args)

    # Build the model
    model_options = {
        'use_nhwc': args.nhwc,
        'pad_input': args.pad_input,
        'bn_group': args.bn_group,
    }

    ssd300 = SSD300(args, args.num_classes, **model_options)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    dboxes = dboxes300_coco()
    # Note: No reason not to use optimised loss
    loss_func = OptLoss()
    loss_func.cuda()

    # Create optimizer.  This must also be done after network_to_half.
    global_batch_size = (args.N_gpu * args.batch_size)
    log_event(key=constants.MODEL_BN_SPAN, value=args.bn_group * args.batch_size)
    log_event(key=constants.GLOBAL_BATCH_SIZE, value=global_batch_size)

    # mlperf only allows base_lr scaled by an integer
    base_lr = 2.5e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = args.wd
    static_loss_scale = 128.

    optim = apex.optimizers.FusedSGD(ssd300.parameters(),
                                     lr=current_lr,
                                     momentum=current_momentum,
                                     weight_decay=current_weight_decay)

    ssd300, optim = apex.amp.initialize(ssd300, optim,
                                        opt_level='O2',
                                        loss_scale=static_loss_scale)

    # Parallelize.  Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     gradient_predivide_factor=args.N_gpu / 8.0,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    log_event(key=constants.OPT_BASE_LR, value=current_lr)
    log_event(key=constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_epochs)
    log_event(key=constants.OPT_LR_DECAY_STEPS, value=args.lr_decay_epochs)
    log_event(key=constants.OPT_WEIGHT_DECAY, value=current_weight_decay)
    if args.warmup is not None:
        log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup)
        log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)

    # Model is completely finished -- need to create separate copies, preserve
    # parameters across them, and jit
    ssd300_eval = SSD300(args, args.num_classes, **model_options).cuda()

    if args.use_fp16:
        convert_network(ssd300_eval, torch.half)

    # Get the existing state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300

    ssd300_eval.load_state_dict(train_model.state_dict())
    ssd300_eval.eval()

    print_message(args.local_rank, "epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0

    start_elapsed_time = time.time()
    last_printed_iter = args.iteration
    num_elapsed_samples = 0

    input_c = 4 if args.pad_input else 3
    example_shape = [args.batch_size, 300, 300, input_c] if args.nhwc else [args.batch_size, input_c, 300, 300]
    example_input = torch.randn(*example_shape).cuda()

    if args.use_fp16:
        example_input = example_input.half()

    if args.jit:
        # DDP has some Python-side control flow.  If we JIT the entire DDP-wrapped
        # module, the resulting ScriptModule will elide this control flow,
        # resulting in allreduce hooks not being called.  If we're running
        # distributed, we need to extract and JIT the wrapped .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the
        # AccumulateGrad hooks to go out of scope, and therefore silently disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit, example_input, check_trace=False)
        else:
            ssd300 = torch.jit.trace(module_to_jit, example_input, check_trace=False)
        # JIT the eval model too
        ssd300_eval = torch.jit.trace(ssd300_eval, example_input, check_trace=False)

    # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here
    ploc, plabel = ssd300(example_input)

    # produce a single dummy "loss" to make things easier
    loss = ploc[0, 0, 0] + plabel[0, 0, 0]
    dloss = torch.randn_like(loss)
    # Cause cudnnFind for dgrad, wgrad to run
    loss.backward(dloss)

    # Necessary import in init
    from pycocotools.coco import COCO

    encoder = build_ssd300_coder()

    evaluator = AsyncEvaluator(num_threads=1)

    log_end(key=constants.INIT_STOP)

    ##### END INIT

    # This is the first place we touch anything related to data
    ##### START DATA TOUCHING
    barrier()
    log_start(key=constants.RUN_START)
    barrier()

    train_pipe = prebuild_pipeline(args)

    train_loader, epoch_size = build_pipeline(args, training=True, pipe=train_pipe)
    if args.rank == 0:
        print("epoch size is: ", epoch_size, " images")

    val_loader, inv_map, cocoGt = build_pipeline(args, training=False)
    if args.profile_gc_off:
        gc.disable()
        gc.collect()

    ##### END DATA TOUCHING
    i_eval = 0
    block_start_epoch = 1
    log_start(key=constants.BLOCK_START,
              metadata={'first_epoch_num': block_start_epoch,
                        'epoch_count': args.evaluation[i_eval]})
    for epoch in range(args.epochs):
        for p in ssd300.parameters():
            p.grad = None

        if epoch in args.evaluation:
            # Get the existing state from the train model
            # * if we use distributed, then we want .module
            train_model = ssd300.module if args.distributed else ssd300

            if args.distributed and args.allreduce_running_stats:
                if args.rank == 0:
                    print("averaging bn running means and vars")
                # make sure every node has the same running bn stats before
                # using them to evaluate, or saving the model for inference
                world_size = float(torch.distributed.get_world_size())
                for bn_name, bn_buf in train_model.named_buffers(recurse=True):
                    if ('running_mean' in bn_name) or ('running_var' in bn_name):
                        torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size

            if args.rank == 0:
                if args.save:
                    print("saving model...")
                    if not os.path.isdir('./models'):
                        os.mkdir('./models')
                    torch.save({"model": ssd300.state_dict()},
                               "./models/iter_{}.pt".format(iter_num))

            ssd300_eval.load_state_dict(train_model.state_dict())
            # Note: No longer returns, evaluation is abstracted away inside evaluator
            coco_eval(args,
                      ssd300_eval,
                      val_loader,
                      cocoGt,
                      encoder,
                      inv_map,
                      epoch,
                      iter_num,
                      evaluator=evaluator)
            log_end(key=constants.BLOCK_STOP,
                    metadata={'first_epoch_num': block_start_epoch})
            if epoch != max(args.evaluation):
                i_eval += 1
                block_start_epoch = epoch + 1
                log_start(key=constants.BLOCK_START,
                          metadata={'first_epoch_num': block_start_epoch,
                                    'epoch_count': (args.evaluation[i_eval] -
                                                    args.evaluation[i_eval - 1])})

        if epoch in args.lr_decay_epochs:
            current_lr *= args.lr_decay_factor
            print_message(args.rank,
                          "lr decay step #" + str(bisect(args.lr_decay_epochs, epoch)))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': epoch + 1,
                            'current_iter_num': iter_num})

        for i, (img, bbox, label) in enumerate(train_loader):
            if args.profile_start is not None and iter_num == args.profile_start:
                torch.cuda.profiler.start()
                torch.cuda.synchronize()
                if args.profile_nvtx:
                    torch.autograd._enable_profiler(torch.autograd.ProfilerState.NVTX)

            if args.profile is not None and iter_num == args.profile:
                if args.profile_start is not None and iter_num >= args.profile_start:
                    # we turned cuda and nvtx profiling on, better turn it off too
                    if args.profile_nvtx:
                        torch.autograd._disable_profiler()
                    torch.cuda.profiler.stop()
                return

            if args.warmup is not None:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr, args)

            if (img is None) or (bbox is None) or (label is None):
                print("No labels in batch")
                continue

            ploc, plabel = ssd300(img)
            ploc, plabel = ploc.float(), plabel.float()

            N = img.shape[0]
            bbox.requires_grad = False
            label.requires_grad = False
            # reshape (N*8732X4 -> Nx8732x4) and transpose (Nx8732x4 -> Nx4x8732)
            bbox = bbox.view(N, -1, 4).transpose(1, 2).contiguous()
            # reshape (N*8732 -> Nx8732) and cast to Long
            label = label.view(N, -1).long()
            loss = loss_func(ploc, plabel, bbox, label)

            if np.isfinite(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            else:
                print("model exploded (corrupted by Inf or Nan)")
                sys.exit()

            num_elapsed_samples += N
            if args.rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                avg_samples_per_sec = num_elapsed_samples * args.N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\
                      .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n")

                last_printed_iter = iter_num
                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            with apex.amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()

            if not args.profile_fake_optim:
                optim.step()

            # Likely a decent skew here, let's take this opportunity to set the
            # gradients to None.  After DALI integration, playing with the
            # placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            # Don't check every iteration due to cost of broadcast
            if iter_num % 20 == 0:
                finished = check_async_evals(args, evaluator, args.threshold)

                if finished:
                    return True

            iter_num += 1

        train_loader.reset()
        log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': epoch + 1})

    return False
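# Worked example (illustrative numbers, not from a real run) of the integer LR-scaling
# rule in train300_mlperf_coco above. MLPerf only allows the effective learning rate to
# be an integer multiple of base_lr = 2.5e-3; with a hypothetical args.lr of 2.5e-3 and
# a global batch size of 1024:
#
#     requested_lr_multiplier = 2.5e-3 / 2.5e-3                 = 1.0
#     adjusted_multiplier     = max(1, round(1.0 * 1024 / 32))  = 32
#     current_lr              = 2.5e-3 * 32                     = 0.08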
def main():
    args = parse_arguments()
    status = 'aborted'  # later set to 'success' if termination criteria met

    mlperf_logger.log_start(key=mlperf_logger.constants.INIT_START,
                            log_all_ranks=True, sync=False)

    if args.use_env and 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])

    device, args = setup_training(args)

    mlperf_logger.mlperf_submission_log('bert')

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.num_epochs_to_generate_seeds_for, device)
    worker_seed = worker_seeds[torch.distributed.get_rank()]

    random.seed(worker_seed)
    np.random.seed(worker_seed)
    torch.manual_seed(worker_seed)
    worker_init = WorkerInitObj(worker_seed)

    mlperf_logger.log_event(key=mlperf_logger.constants.SEED, value=args.seed,
                            sync=False)
    mlperf_logger.log_event(key=mlperf_logger.constants.GLOBAL_BATCH_SIZE,
                            value=global_batch_size(args), sync=False)
    mlperf_logger.log_event(key='opt_gradient_accumulation_steps',
                            value=args.gradient_accumulation_steps, sync=False)
    mlperf_logger.log_event(key='max_predictions_per_seq',
                            value=args.max_predictions_per_seq, sync=False)
    mlperf_logger.log_event(key='opt_learning_rate_training_steps',
                            value=args.max_steps, sync=False)
    mlperf_logger.log_event(key='num_warmup_steps',
                            value=int(args.warmup_proportion * args.max_steps) if args.warmup_steps == 0 else args.warmup_steps,
                            sync=False)

    if utils.is_main_process():
        print("parsed args:")
        print(args)

    # Prepare optimizer
    model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer(args, device)
    samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu

    if args.unpad:
        torch.cuda.synchronize()
        InitMHACUDAExtension()
        torch.cuda.synchronize()

    final_loss = float("inf")
    train_time_raw = float("inf")
    raw_train_start = time.time()

    if args.do_train:
        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 1
        training_steps = 0
        end_training, converged = False, False
        samples_trained_prev = 0
        eval_count = 0

        pool = ProcessPoolExecutor(1)

        if args.target_mlm_accuracy:
            if args.train_mlm_accuracy_window_size > 0:
                accuracy_scores = []
                avg_mlm_accuracy = torch.Tensor([0]).cuda()

        first_epoch = True
        if found_resume_checkpoint(args):
            f_start_id = checkpoint['files'][0]
            files = checkpoint['files'][1:]
            num_files = len(files)
        else:
            files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                     os.path.isfile(os.path.join(args.input_dir, f)) and 'part' in f]
            files.sort()
            num_files = len(files)
            random.Random(shuffling_seeds[epoch]).shuffle(files)
            f_start_id = 0

        mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP, sync=False)
        mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START, sync=True)
        mlperf_logger.barrier()

        # Start prefetching eval dataset
        if args.eval_dir:
            eval_dataset_future = pool.submit(create_eval_dataset, args, worker_init_fn=worker_init)

        while global_step < args.max_steps and not end_training:
            mlperf_logger.log_start(key=mlperf_logger.constants.EPOCH_START,
                                    metadata={'epoch_num': epoch}, sync=False)
            mlperf_logger.log_start(key=mlperf_logger.constants.BLOCK_START,
                                    metadata={'first_epoch_num': epoch,
                                              'epoch_count': 1},
                                    sync=False)
            if utils.is_main_process():
                print("parsed args:")
                print(args)

                now_time = time.time()
                now_step = global_step
                now_skipped = skipped_steps

                print("epoch:", epoch)

            thread = None

            # Reshuffle file list on subsequent epochs
            if not first_epoch:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and 'part' in f]
                files.sort()
                num_files = len(files)
                random.Random(shuffling_seeds[epoch]).shuffle(files)
                f_start_id = 0

            first_epoch = False

            shared_file_list = {}

            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank() +
                                   remainder * f_start_id) % num_files]
            else:
                data_file = files[(f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size,
                                          num_workers=4, worker_init_fn=worker_init,
                                          pin_memory=True)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            for f_id in range(f_start_id + 1, len(files)):
                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank() +
                                       remainder * f_id) % num_files]
                else:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

                previous_file = data_file

                dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                             args.max_predictions_per_seq,
                                             shared_file_list, args,
                                             worker_init_fn=worker_init)

                for step, batch in enumerate(train_dataloader):
                    training_steps += 1
                    update_step = training_steps % args.gradient_accumulation_steps == 0

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss, mlm_acc, _ = model(input_ids=input_ids,
                                             token_type_ids=segment_ids,
                                             attention_mask=input_mask,
                                             masked_lm_labels=masked_lm_labels,
                                             next_sentence_label=next_sentence_labels,
                                             checkpoint_activations=args.checkpoint_activations)

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer,
                                            delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    if update_step:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(args, optimizer, model,
                                                          overflow_buf, global_step)
                        samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu

                        if (args.eval_dir and args.eval_iter_samples > 0 and
                                samples_trained >= args.eval_iter_start_samples + eval_count * args.eval_iter_samples):

                            # on first eval, get eval_dataloader
                            if eval_count == 0:
                                eval_dataloader = eval_dataset_future.result(timeout=None)

                            samples_trained_prev = samples_trained
                            eval_avg_loss, eval_avg_mlm_accuracy = run_eval(model, eval_dataloader, device,
                                                                            args.num_eval_examples,
                                                                            first_eval=(eval_count == 0),
                                                                            use_cache=args.cache_eval_data)
                            if utils.is_main_process():
                                mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_ACCURACY,
                                                        value=eval_avg_mlm_accuracy,
                                                        metadata={'epoch_num': epoch},
                                                        sync=False)
                                print({"global_steps": global_step,
                                       "eval_loss": eval_avg_loss,
                                       "eval_mlm_accuracy": eval_avg_mlm_accuracy})

                            if args.target_mlm_accuracy:
                                if eval_avg_mlm_accuracy >= args.target_mlm_accuracy:
                                    end_training, converged = True, True
                                    if utils.is_main_process():
                                        print("%f > %f, Target MLM Accuracy reached at %d"
                                              % (eval_avg_mlm_accuracy, args.target_mlm_accuracy, global_step))

                            eval_count += 1

                    if args.target_mlm_accuracy and args.train_mlm_accuracy_window_size > 0:
                        accuracy_scores.append(mlm_acc)
                        if update_step:
                            accuracy_scores = accuracy_scores[-args.train_mlm_accuracy_window_size * args.gradient_accumulation_steps:]
                            avg_mlm_accuracy[0] = sum(accuracy_scores) / len(accuracy_scores)
                            torch.distributed.all_reduce(avg_mlm_accuracy, op=torch.distributed.ReduceOp.SUM)
                            avg_mlm_accuracy /= torch.distributed.get_world_size()

                    if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu
                        if utils.is_main_process():
                            time_interval = time.time() - now_time
                            step_interval = global_step - now_step
                            skip_interval = skipped_steps - now_skipped
                            now_time = time.time()
                            now_step = global_step
                            now_skipped = skipped_steps
                            training_perf = args.train_batch_size * args.gradient_accumulation_steps * args.n_gpu \
                                            * (step_interval + skip_interval) / time_interval

                            if args.train_mlm_accuracy_window_size > 0:
                                print({"training_steps": training_steps,
                                       "average_loss": average_loss / (args.log_freq * divisor),
                                       "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                       "learning_rate": optimizer.param_groups[0]['lr'],
                                       "seq/s": training_perf,
                                       "global_steps": now_step,
                                       "samples_trained": samples_trained,
                                       "skipped_steps": now_skipped,
                                       "timestamp": now_time,
                                       "mlm_accuracy": avg_mlm_accuracy[0].item()})
                            else:
                                print({"training_steps": training_steps,
                                       "average_loss": average_loss / (args.log_freq * divisor),
                                       "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                       "learning_rate": optimizer.param_groups[0]['lr'],
                                       "seq/s": training_perf,
                                       "global_steps": now_step,
                                       "samples_trained": samples_trained,
                                       "skipped_steps": now_skipped,
                                       "timestamp": now_time})

                        average_loss = 0

                    if global_step >= args.max_steps or end_training:
                        status = 'success' if converged else 'aborted'
                        end_training = True
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if (torch.distributed.is_initialized()):
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = average_loss.item()
                        if utils.is_main_process():
                            if args.train_mlm_accuracy_window_size > 0:
                                print((epoch, training_steps / args.gradient_accumulation_steps, ),
                                      {"final_loss": final_loss,
                                       "final_mlm_accuracy": avg_mlm_accuracy[0].item()})
                            else:
                                print((epoch, training_steps / args.gradient_accumulation_steps, ),
                                      {"final_loss": final_loss})

                    if end_training or (samples_trained - samples_trained_prev >= args.num_samples_per_checkpoint and
                                        samples_trained >= args.min_samples_to_start_checkpoints):
                        samples_trained_prev = samples_trained
                        if utils.is_main_process() and not args.skip_checkpoint:
                            # Save a trained model
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                            if args.phase2:
                                output_save_file = os.path.join(args.output_dir, "phase2_ckpt_{}.pt".format(samples_trained))
                            else:
                                output_save_file = os.path.join(args.output_dir, "phase1_ckpt_{}.pt".format(samples_trained))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files},
                                           output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > args.keep_n_most_recent_checkpoints:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)

                        if samples_trained >= args.max_samples_termination or end_training:
                            status = 'success' if converged else 'aborted'
                            end_training = True
                            break

                del train_dataloader

                if samples_trained >= args.max_samples_termination or end_training:
                    status = 'success' if converged else 'aborted'
                    end_training = True
                    break

                train_dataloader, data_file = dataset_future.result(timeout=None)

            mlperf_logger.log_end(key=mlperf_logger.constants.BLOCK_STOP,
                                  metadata={'first_epoch_num': epoch},
                                  sync=False)
            mlperf_logger.log_end(key=mlperf_logger.constants.EPOCH_STOP,
                                  metadata={'epoch_num': epoch}, sync=False)
            epoch += 1

        mlperf_logger.log_event(key=mlperf_logger.constants.TRAIN_SAMPLES,
                                value=samples_trained,
                                sync=False)
        mlperf_logger.log_event(key=mlperf_logger.constants.EVAL_SAMPLES,
                                value=args.num_eval_examples,
                                sync=False)
        mlperf_logger.log_end(key=mlperf_logger.constants.RUN_STOP,
                              metadata={'status': status}, sync=False)

    return args, final_loss, train_time_raw
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None

    config = BertConfig.from_json_file(args.bert_config_path)
    config.fused_mha = args.fused_mha
    config.fused_gelu_bias = args.fused_gelu_bias
    config.dense_seq_output = args.dense_seq_output
    config.unpad = args.unpad
    config.pad = args.pad
    config.fuse_qkv = not args.disable_fuse_qkv
    config.fuse_scale = not args.disable_fuse_scale
    config.fuse_mask = not args.disable_fuse_mask
    config.fuse_dropout = args.enable_fuse_dropout
    config.apex_softmax = not args.disable_apex_softmax
    config.enable_stream = args.enable_stream
    if config.fuse_mask == True:
        config.apex_softmax = True
    if config.pad == False:
        config.enable_stream = True
    if config.unpad == True:
        config.fused_mha = False

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Load from Pyt checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    if args.init_checkpoint is not None or found_resume_checkpoint(args):
        # Prepare model
        model = BertForPreTraining(config)
        if args.init_checkpoint is None:  # finding checkpoint in output_dir
            checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
            model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
            global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
            args.resume_step = global_step  # used for throughput computation

            resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
            print("Setting init checkpoint to %s - which is the latest in %s"
                  % (resume_init_checkpoint, args.output_dir))
            checkpoint = torch.load(resume_init_checkpoint, map_location="cpu")
        else:
            checkpoint = torch.load(args.init_checkpoint, map_location="cpu")["model"]

        # Fused MHA requires a remapping of checkpoint parameters
        if config.fused_mha:
            checkpoint_remapped = remap_attn_parameters(checkpoint)
            model.load_state_dict(checkpoint_remapped, strict=False)
        else:
            model.load_state_dict(checkpoint, strict=True)
    else:  # Load from TF checkpoint
        model = BertForPreTraining.from_pretrained(args.init_tf_checkpoint, from_tf=True, config=config)

    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    mlperf_logger.log_event(key=mlperf_logger.constants.OPT_BASE_LR,
                            value=args.learning_rate, sync=False)

    optimizer = FusedLAMB(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2))
    mlperf_logger.log_event(key='opt_epsilon', value=optimizer.defaults['eps'],
                            sync=False)
    b1, b2 = optimizer.defaults['betas']
    mlperf_logger.log_event(key='opt_lamb_beta_1', value=b1, sync=False)
    mlperf_logger.log_event(key='opt_lamb_beta_2', value=b2, sync=False)
    mlperf_logger.log_event(key='opt_lamb_weight_decay_rate',
                            value=optimizer.defaults['weight_decay'],
                            sync=False)

    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step

    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer,
                                                  start_warmup_steps=warmup_start,
                                                  warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps,
                                                  end_learning_rate=0.0,
                                                  degree=1.0)

    if args.fp16:
        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale)
        amp._amp_state.loss_scalers[0]._loss_scale = float(os.getenv("INIT_LOSS_SCALE", 2**20))

    if found_resume_checkpoint(args):
        # restores m, v states (only if resuming checkpoint, not for init_checkpoint and init_tf_checkpoint for now)
        optimizer.load_state_dict(checkpoint['optimizer'])

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model, message_size=250000000,
                        gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()],
                           torch.distributed.broadcast, (0,))

    return model, optimizer, lr_scheduler, checkpoint, global_step
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
    self.warmup = warmup
    self.total_steps = total_steps
    super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)
    mlperf_logger.log_event(key=mlperf_logger.constants.OPT_LR_WARMUP_STEPS,
                            value=total_steps * warmup, sync=False)
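# Note on the value logged above (an assumption based on the call signature, not stated in
# the source): `warmup` here is a warmup *proportion* rather than a step count, so
# OPT_LR_WARMUP_STEPS is reported as the absolute number of warmup steps,
# total_steps * warmup. For example, warmup=0.01 with total_steps=10000 would be logged
# as 100 warmup steps.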