def generate_query_vectors(self, qa_data, split):
    """Generate query embeddings and references for a natural-questions split."""
    self.eval_dataset = get_nq_dataset(qa_data, split)
    dataloader = get_one_epoch_nq_dataloader(self.eval_dataset)

    query_vectors = []
    reference_list = []

    for batch in dataloader:
        # batch also has query_tokens and query_pad_data
        query_tokens, query_mask, query_types, \
            query_len, reference = process_nq_batch(batch)

        assert len(self.model) == 1
        unwrapped_model = self.model[0]
        while not hasattr(unwrapped_model, 'embed_text'):
            unwrapped_model = unwrapped_model.module

        with torch.no_grad():
            query_logits = unwrapped_model.embed_text(
                unwrapped_model.query_model, query_tokens,
                query_mask, query_types)

        reference_list.extend(reference)
        query_vectors.extend(query_logits.split(1, dim=0))
        if len(query_vectors) % 100 == 0:
            print_rank_0('Encoded queries {}'.format(len(query_vectors)))

    query_tensor = torch.cat(query_vectors, dim=0)
    print_rank_0('Total encoded queries tensor {}'.format(query_tensor.size()))

    assert query_tensor.size(0) == len(self.eval_dataset)
    return query_tensor, reference_list

def __init__(self, optimizer, max_lr, min_lr, warmup_steps, decay_steps,
             decay_style, use_checkpoint_lr_scheduler=True,
             override_lr_scheduler=False):
    # Class values.
    self.optimizer = optimizer

    self.max_lr = float(max_lr)
    self.min_lr = min_lr
    assert self.min_lr >= 0.0
    assert self.max_lr >= self.min_lr

    self.warmup_steps = warmup_steps
    self.num_steps = 0
    self.decay_steps = decay_steps
    assert self.decay_steps > 0
    assert self.warmup_steps < self.decay_steps

    self.decay_style = decay_style

    self.override_lr_scheduler = override_lr_scheduler
    self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
    if self.override_lr_scheduler:
        assert not self.use_checkpoint_lr_scheduler, 'both override and ' \
            'use-checkpoint are set.'

    # Set the learning rate.
    self.step(0)

    print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

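# A minimal sketch of the warmup-then-decay rule a scheduler like the one
# above typically applies on each step. The free function `compute_lr` is
# hypothetical (not part of the class) and only illustrates the linear decay
# style; the class also accepts other decay styles.
def compute_lr(num_steps, max_lr, min_lr, warmup_steps, decay_steps):
    # Linear warmup from 0 up to max_lr.
    if warmup_steps > 0 and num_steps <= warmup_steps:
        return max_lr * num_steps / warmup_steps
    # After decay_steps, hold at min_lr.
    if num_steps >= decay_steps:
        return min_lr
    # Linear decay from max_lr down to min_lr.
    decay_ratio = (num_steps - warmup_steps) / (decay_steps - warmup_steps)
    return max_lr - (max_lr - min_lr) * decay_ratio

assert compute_lr(0, 1e-4, 1e-5, 100, 1000) == 0.0
assert compute_lr(100, 1e-4, 1e-5, 100, 1000) == 1e-4
assert compute_lr(1000, 1e-4, 1e-5, 100, 1000) == 1e-5
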
def train_step(forward_step_func, data_iterator, model, optimizer,
               lr_scheduler):
    """Single training step."""
    args = get_args()
    timers = get_timers()

    # Forward model for one step.
    timers('forward').start()
    loss, loss_reduced = forward_step_func(data_iterator, model)
    print_rank_0(f"Loss {loss} and reduced loss {loss_reduced}")
    timers('forward').stop()

    # Calculate gradients, reduce across processes, and clip.
    timers('backward').start()
    backward_step(optimizer, model, loss)
    timers('backward').stop()

    # Update parameters.
    skipped_iter = 0
    timers('optimizer').start()
    if args.deepspeed:
        model.step()
    else:
        optimizer.step()

        # Update learning rate.
        if not (args.fp16 and optimizer.overflow):
            lr_scheduler.step()
        else:
            skipped_iter = 1
    timers('optimizer').stop()

    return loss_reduced, skipped_iter

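# A hypothetical forward_step_func matching the contract train_step expects:
# consume one batch from `data_iterator`, run the model, and return
# (loss, reduced_loss_dict). This is a single-process sketch assuming a plain
# nn.Module and an iterator yielding (inputs, labels); a real implementation
# would also average the loss across data-parallel ranks.
import torch.nn.functional as F

def toy_forward_step(data_iterator, model):
    inputs, labels = next(data_iterator)
    logits = model(inputs)
    loss = F.cross_entropy(logits, labels)
    # No all-reduce here since this sketch assumes one process.
    loss_reduced = {'lm loss': loss.detach()}
    return loss, loss_reduced
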
def get_learning_rate_scheduler(optimizer, neox_args):
    """Build the learning rate scheduler."""
    if neox_args.no_load_optim:
        # TODO: this should be configured as a separate arg
        return None
    if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam":
        print_rank_0(
            "WARNING: onebitadam requires the lr scheduler be built by deepspeed - "
            "Make sure one is added to your deepspeed config")
        return None

    # Add linear learning rate scheduler.
    if neox_args.lr_decay_iters is not None:
        num_iters = neox_args.lr_decay_iters
    else:
        num_iters = neox_args.train_iters
    num_iters = max(1, num_iters)
    init_step = 0
    warmup_iter = neox_args.warmup * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
        start_lr=neox_args.lr,
        warmup_iter=warmup_iter,
        total_iters=num_iters,
        decay_style=neox_args.lr_decay_style,
        last_iter=init_step,
        min_lr=neox_args.min_lr,
        use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler,
        override_lr_scheduler=neox_args.override_lr_scheduler,
    )

    return lr_scheduler

def load_state_dict(self, state_dict):
    # Optimizer.
    optimizer_key = 'optimizer'
    if optimizer_key not in state_dict:
        optimizer_key = 'optimizer_state_dict'
        print_rank_0('***WARNING*** loading optimizer from '
                     'an old checkpoint ...')
    self.optimizer.load_state_dict(state_dict[optimizer_key])

    # Grad scaler.
    if 'grad_scaler' not in state_dict:
        print_rank_0('***WARNING*** found an old checkpoint, will not '
                     'load grad scaler ...')
    else:
        self.grad_scaler.load_state_dict(state_dict['grad_scaler'])

    # Copy data for the main params.
    fp32_from_fp16_params_key = 'fp32_from_fp16_params'
    if fp32_from_fp16_params_key not in state_dict:
        fp32_from_fp16_params_key = 'fp32_from_fp16'
    for current_group, saved_group in zip(
            self.fp32_from_fp16_groups,
            state_dict[fp32_from_fp16_params_key]):
        for current_param, saved_param in zip(current_group, saved_group):
            current_param.data.copy_(saved_param.data)

def model_provider():
    """Build the model."""
    args = get_args()

    print_rank_0("building classification model for ImageNet ...")

    return VitModel(num_classes=args.num_classes, finetune=True)

def read_metadata(tracker_filename):
    # Read the tracker file and either set the iteration or
    # mark it as a release checkpoint.
    iteration = 0
    release = False
    with open(tracker_filename, 'r') as f:
        metastring = f.read().strip()
        try:
            iteration = int(metastring)
        except ValueError:
            release = metastring == 'release'
            if not release:
                print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
                    tracker_filename))
                sys.exit()
    assert iteration > 0 or release, 'error parsing metadata file {}'.format(
        tracker_filename)

    # Get the max iteration retrieved across the ranks.
    iters_cuda = torch.cuda.LongTensor([iteration])
    torch.distributed.all_reduce(iters_cuda, op=torch.distributed.ReduceOp.MAX)
    max_iter = iters_cuda[0].item()

    # We should now have all the same iteration.
    # If not, print a warning and choose the maximum
    # iteration across all ranks.
    if iteration != max_iter:
        rank = torch.distributed.get_rank()
        print('WARNING: on rank {} found iteration {} in the '
              'metadata while max iteration across the ranks '
              'is {}, replacing it with max iteration.'.format(
                  rank, iteration, max_iter), flush=True)
    return max_iter, release

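# A self-contained illustration of the tracker-file parsing above, without
# the distributed max-reduction: the file contains either an integer
# iteration or the literal string 'release'. `parse_metastring` is a
# hypothetical helper, not part of the checkpoint code.
def parse_metastring(metastring):
    try:
        return int(metastring), False
    except ValueError:
        return 0, metastring == 'release'

assert parse_metastring('1000') == (1000, False)
assert parse_metastring('release') == (0, True)
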
def _build_train_valid_dataloaders(train_dataset, valid_dataset):
    """Training and validation dataloaders."""
    args = get_args()

    print_rank_0('building train and validation dataloaders ...')
    # Training dataset.
    train_dataloader = build_data_loader(train_dataset, args.micro_batch_size,
                                         args.num_workers, not args.keep_last)
    # Set the training iterations.
    args.train_iters_per_epoch = len(train_dataloader)
    args.train_iters = args.epochs * args.train_iters_per_epoch
    # Validation dataset. For this dataset, we do not need to set up
    # shuffling so we can just use a simple infinite loop.
    valid_dataloader_ = build_data_loader(valid_dataset, args.micro_batch_size,
                                          args.num_workers, not args.keep_last)
    valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)

    # Now that we've built the data loaders, set batch_size arguments
    # to the actual batch size the model will see for this dataset.
    # This is necessary so pipeline transfers know what size they are
    # and the LR schedule, which is based on samples seen, gets set
    # correctly.
    args.orig_micro_batch_size = args.micro_batch_size
    args.orig_global_batch_size = args.global_batch_size
    if hasattr(train_dataset, 'sample_multiplier'):
        # If our dataset has a sample_multiplier attribute, each "sample"
        # from the dataset actually contains multiple samples that will
        # collapse into the batch dimension (for example the RACE dataset,
        # which has several options per question), so we need to account
        # for that when setting the micro batch size.
        args.micro_batch_size *= train_dataset.sample_multiplier
        args.global_batch_size *= train_dataset.sample_multiplier

    return train_dataloader, valid_dataloader

def setup_for_inference_or_eval(inference=True, get_key_value=True,
                                overwrite_values=None):
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # Initialize megatron.
    initialize_megatron(neox_args)

    # Set up model and load checkpoint. We use setup_model_and_optimizer
    # instead of get_model in order to initialize deepspeed.
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args, inference=inference, get_key_value=get_key_value
    )
    print_rank_0('Finished loading model')
    return model, neox_args

def model_provider():
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_tokentypes=0, parallel_output=False)

    return model

def train(forward_step_func, model, optimizer, lr_scheduler,
          train_data_iterator, valid_data_iterator):
    """Train the model function."""
    args = get_args()
    timers = get_timers()

    # Turn on training mode which enables dropout.
    model.train()

    # Tracking loss.
    total_loss_dict = {}

    # Iterations.
    iteration = args.iteration

    timers('interval time').start()
    report_memory_flag = True
    while iteration < args.train_iters:
        loss_dict, skipped_iter = train_step(forward_step_func,
                                             train_data_iterator,
                                             model, optimizer, lr_scheduler)
        iteration += 1

        # Logging.
        loss_scale = None
        if args.fp16:
            loss_scale = optimizer.cur_scale if args.deepspeed else optimizer.loss_scale
        report_memory_flag = training_log(loss_dict, total_loss_dict,
                                          optimizer.param_groups[0]['lr'],
                                          iteration, loss_scale,
                                          report_memory_flag, skipped_iter)

        # Autoresume.
        if args.adlr_autoresume and \
           (iteration % args.adlr_autoresume_interval == 0):
            check_adlr_autoresume_termination(iteration, model, optimizer,
                                              lr_scheduler)

        # Checkpointing.
        if args.save and args.save_interval and \
           iteration % args.save_interval == 0:
            save_checkpoint(iteration, model, optimizer, lr_scheduler)

        # Evaluation.
        if args.eval_interval and iteration % args.eval_interval == 0 and \
           args.do_valid:
            prefix = 'iteration {}'.format(iteration)
            evaluate_and_print_results(prefix, forward_step_func,
                                       valid_data_iterator, model,
                                       iteration, False)

        if args.exit_interval and iteration % args.exit_interval == 0:
            torch.distributed.barrier()
            time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            rank = torch.distributed.get_rank()
            print_rank_0('rank: {} | time: {} | exiting the program at '
                         'iteration {}'.format(rank, time_str, iteration))
            sys.exit()

    return iteration

def main():
    """Main program."""
    args = get_args()

    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported for text generation.")
        exit()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False)
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.micro_batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')

def update_train_iters(args):

    # For iteration-based training, we don't need to do anything.
    if args.train_iters:
        return

    # Constant batch size with sample-based training.
    if args.rampup_batch_size is None:
        args.train_iters = args.train_samples // args.global_batch_size

    else:
        # Sample-based training with rampup batch size.
        iterations = 0
        consumed_samples = 0
        # Rampup phase.
        while consumed_samples <= int(args.rampup_batch_size[2]):
            update_num_microbatches(consumed_samples, consistency_check=False)
            consumed_samples += get_current_global_batch_size()
            iterations += 1
        # Reset.
        update_num_microbatches(0, consistency_check=False)
        # Constant phase.
        # Note that we throw away any partial last batch.
        iterations += (args.train_samples - consumed_samples) // \
            args.global_batch_size
        args.train_iters = iterations

    print_rank_0('setting training iterations to {}'.format(args.train_iters))

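# A worked example of the constant-batch-size branch above, assuming
# train_samples=1000 and global_batch_size=32: any partial last batch is
# thrown away, so training runs for 1000 // 32 = 31 iterations
# (31 * 32 = 992 samples consumed, 8 samples discarded).
assert 1000 // 32 == 31
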
def get_model(neox_args, inference=False, get_key_value=True):
    """Build the model."""

    print_rank_0('building GPT2 model ...')

    # Build model on cpu.
    model = GPT2ModelPipe(neox_args=neox_args,
                          num_tokentypes=0,
                          parallel_output=True,
                          topology=mpu.get_topology(),
                          inference=inference,
                          get_key_value=get_key_value)

    if not neox_args.is_pipe_parallel:
        # Export PipeParallel model to nn.Sequential model to avoid the
        # overhead of deepspeed's pipe parallel training.
        model = model.to_sequential()
    else:
        # This is a hack to give us a reference to get_batch_pipe from
        # within training.py. We need to call model.set_batch_fn after
        # deepspeed.initialize.
        model._megatron_batch_fn = partial(get_batch_pipe, neox_args=neox_args)

    if neox_args.deepspeed:
        # DeepSpeed handles CUDA, FP16, and DDP components.
        return model
    else:
        raise ValueError("Must be using deepspeed to run neox")

def metrics_func(model, epoch, output_predictions=False):
    print_rank_0('calculating metrics ...')
    correct = 0
    total = 0
    if output_predictions:
        assert mpu.get_data_parallel_world_size() == 1
        named_predictions = []
        names = 'predictions'
    # `dataloaders` and `args` are expected to come from the enclosing scope.
    for name, dataloader in dataloaders:
        output = calculate_correct_answers(name, model, dataloader,
                                           epoch, output_predictions)
        if not output_predictions:
            correct_ans, total_count = output
        else:
            correct_ans, total_count, predictions = output
            named_predictions.append((name, predictions))
            names += '_' + name
        correct += correct_ans
        total += total_count
    percent = float(correct) * 100.0 / float(total)
    print_rank_0(' >> |epoch: {}| overall: correct / total = {} / {} = '
                 '{:.4f} %'.format(epoch, correct, total, percent))

    if output_predictions and torch.distributed.get_rank() == 0:
        assert args.load is not None
        filename = os.path.join(args.load, names + '.pt')
        torch.save(named_predictions, filename)

def biencoder_model_provider(only_query_model=False,
                             only_context_model=False,
                             biencoder_shared_query_context_model=False,
                             pre_process=True,
                             post_process=True):
    """Build the model."""
    assert mpu.get_tensor_model_parallel_world_size() == 1 and \
        mpu.get_pipeline_model_parallel_world_size() == 1, \
        "Model parallel size > 1 not supported for ICT"

    print_rank_0('building BiEncoderModel...')

    # Simpler to just keep using 2 tokentypes since
    # the LM we initialize with has 2 tokentypes.
    model = BiEncoderModel(
        num_tokentypes=2,
        parallel_output=False,
        only_query_model=only_query_model,
        only_context_model=only_context_model,
        biencoder_shared_query_context_model=\
            biencoder_shared_query_context_model,
        pre_process=pre_process,
        post_process=post_process)

    return model

def evaluate(forward_step_func, data_iterator, model, verbose=False):
    """Evaluation."""
    args = get_args()

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_loss_dict = {}

    with torch.no_grad():
        iteration = 0
        while iteration < args.eval_iters:
            iteration += 1
            if verbose and iteration % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(iteration,
                                                            args.eval_iters))
            # Forward evaluation.
            _, loss_dict = forward_step_func(data_iterator, model)
            # Reduce across processes.
            for key in loss_dict:
                total_loss_dict[key] = total_loss_dict.get(key, 0.) + \
                    loss_dict[key]

    # Move model back to the train mode.
    model.train()

    for key in total_loss_dict:
        total_loss_dict[key] /= args.eval_iters

    return total_loss_dict

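# The final averaging above is a running sum divided by eval_iters; a
# self-contained sketch of the same accumulation using toy loss dicts:
total_loss = {}
for step_losses in [{'lm loss': 2.0}, {'lm loss': 4.0}]:
    for key in step_losses:
        total_loss[key] = total_loss.get(key, 0.) + step_losses[key]
for key in total_loss:
    total_loss[key] /= 2  # eval_iters
assert total_loss == {'lm loss': 3.0}
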
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # Get model without FP16 and/or TorchDDP wrappers.
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    if args.iteration == 0 and hasattr(unwrapped_model,
                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

    return model, optimizer, lr_scheduler

def main():
    """Main program."""
    args = get_args()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric))
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')

def build_the_dataset(data_prefix, name, data_impl, num_samples, seq_length,
                      seed, skip_warmup, build_index_mappings=True):
    """Build a single (train, valid, or test) GPT2 dataset."""

    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)

    total_num_of_documents = indexed_dataset.sizes.shape[0]
    print_rank_0('    {}:'.format(name))
    print_rank_0('     no. of documents: {}'.format(total_num_of_documents))
    dataset = None
    documents = np.arange(start=0, stop=total_num_of_documents, step=1,
                          dtype=np.int32)
    dataset = GPT2Dataset(name, data_prefix, documents, indexed_dataset,
                          num_samples, seq_length, seed,
                          build_index_mappings=build_index_mappings)
    return dataset

def __init__(self, datasets, weights):

    self.datasets = datasets
    num_datasets = len(datasets)
    assert num_datasets == len(weights)

    self.size = 0
    for dataset in self.datasets:
        self.size += len(dataset)

    # Normalize weights.
    weights = np.array(weights, dtype=np.float64)
    sum_weights = np.sum(weights)
    assert sum_weights > 0.0
    weights /= sum_weights

    # Build indices.
    start_time = time.time()
    assert num_datasets < 255
    self.dataset_index = np.zeros(self.size, dtype=np.uint8)
    self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

    from megatron.data import helpers
    helpers.build_blending_indices(self.dataset_index,
                                   self.dataset_sample_index,
                                   weights, num_datasets, self.size,
                                   torch.distributed.get_rank() == 0)
    print_rank_0('> elapsed time for building blendable dataset indices: '
                 '{:.2f} (sec)'.format(time.time() - start_time))

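# A worked example of the weight normalization above: raw weights are
# rescaled to sum to 1.0 before the blending indices are built.
import numpy as np

raw_weights = np.array([2.0, 1.0, 1.0], dtype=np.float64)
raw_weights /= np.sum(raw_weights)
assert np.allclose(raw_weights, [0.5, 0.25, 0.25])
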
def model_provider():
    """Build the model."""

    # `eval_metric` is expected to come from the enclosing scope
    # (e.g. a get_model_provider(eval_metric) closure).
    if eval_metric == 'loss':
        parallel_output = True
    elif eval_metric == 'accuracy':
        parallel_output = False
    else:
        raise NotImplementedError('output type for {} evaluation metric '
                                  'is not supported.'.format(eval_metric))

    print_rank_0('building GPT2 model ...')
    if mpu.get_pipeline_model_parallel_world_size() > 1:
        # Determine model based on position of stage in pipeline.
        if mpu.is_pipeline_first_stage():
            model = GPT2ModelFirstStage(num_tokentypes=0)
        elif mpu.is_pipeline_last_stage():
            model = GPT2ModelLastStage(parallel_output=parallel_output,
                                       num_tokentypes=0)
        else:
            model = GPT2ModelIntermediateStage(num_tokentypes=0)
    else:
        model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output)

    return model

def __init__(self, optimizer, start_lr, warmup_iter, total_iters,
             decay_style, last_iter, min_lr=0.0,
             use_checkpoint_lr_scheduler=True,
             override_lr_scheduler=False):

    # Class values.
    self.optimizer = optimizer
    self.start_lr = start_lr
    self.min_lr = min_lr
    self.warmup_iter = warmup_iter
    self.num_iters = last_iter
    self.end_iter = total_iters
    assert self.end_iter > 0
    self.decay_style = decay_style
    self.override_lr_scheduler = override_lr_scheduler
    self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
    if self.override_lr_scheduler:
        assert not self.use_checkpoint_lr_scheduler, 'both override and ' \
            'use-checkpoint are set.'

    # Set the learning rate.
    self.step(self.num_iters)

    print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer, param_groups = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.pipe_parallel_size == 0 else None,
            dist_init_required=False,
            model_parameters=param_groups if optimizer is None else None)

        if args.pipe_parallel_size > 0:
            model.set_batch_fn(model.module._megatron_batch_fn)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # Get model without FP16 and/or TorchDDP wrappers.
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    return model, optimizer, lr_scheduler

def get_learning_rate_scheduler(optimizer):
    """Build the learning rate scheduler."""
    args = get_args()

    if args.deepspeed and args.onebitadam:
        print_rank_0("WARNING: onebitadam requires the lr scheduler be built "
                     "by deepspeed - Make sure one is added to your deepspeed "
                     "config")
        return None

    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.train_iters
    num_iters = max(1, num_iters)
    init_step = 0
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
        start_lr=args.lr,
        warmup_iter=warmup_iter,
        total_iters=num_iters,
        decay_style=args.lr_decay_style,
        last_iter=init_step,
        min_lr=args.min_lr,
        use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler,
        override_lr_scheduler=args.override_lr_scheduler)

    return lr_scheduler

def generate_samples_unconditional(neox_args, model,
                                   number_of_samples: int = 10,
                                   output_file=None,
                                   eos_token_id: int = None,
                                   maximum_tokens: int = 64,
                                   recompute: bool = False,
                                   temperature: float = 0.0,
                                   top_k: int = 0,
                                   top_p: float = 0.0):
    """
    Generates samples unconditionally (no prompt) and yields them in a dictionary.

    neox_args: NeoXArgs with tokenizer, reset_position_ids, reset_attention_mask and eod_mask_loss
    model: a Megatron model
    number_of_samples (default 10): number of unconditional samples to be generated
    output_file: file where generation results are stored in jsonl format. No file is stored if omitted.
    eos_token_id: end-of-text token at which completion is terminated, even if the max_tokens count has not been reached
    maximum_tokens: maximum number of tokens to be generated
    recompute: flag indicating whether a cache is used for already forwarded tokens (true) or whether all tokens are recomputed at every iteration (false)
    temperature (default 0.0): exponential scaling of the output distribution ("higher == more risk")
    top_k (default 0): integer between 0 and the model's vocab size. Filters out any logits with a probability less than that of the top_k-th token.
    top_p (default 0.0): Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p.

    Note: greedy decoding is used if temperature is 0.0, top_k is 0 and top_p is 0.0.

    yields: dict containing the following fields:
        - 'context' (the input)
        - 'text' (the completion)
        - 'length' (the length of the completion in number of tokens)
        - 'finished'
        - 'message': a message associated with the generation procedure, can be a warning or error
        - 'duration_seconds': duration of the generation in seconds
    """
    print_rank_0('generate_samples_unconditional() generating...')
    generated_texts = generate_samples_from_prompt(
        neox_args=neox_args,
        model=model,
        text=["" for _ in range(number_of_samples)],
        eos_token_id=eos_token_id,
        maximum_tokens=maximum_tokens,
        recompute=recompute,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p)

    if is_mp_rank_0():
        if output_file is not None:
            with open(output_file, "w") as f_out:
                for item in generated_texts:
                    f_out.write(json.dumps(item) + '\n')
    print_rank_0('generate_samples_unconditional() done')
    return generated_texts

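# A hypothetical call site for the generator above: load a model with
# setup_for_inference_or_eval() (defined elsewhere in this codebase) and
# write a handful of unconditional samples to a jsonl file. All parameter
# values here are illustrative only.
def sample_unconditionally_to_file(path='unconditional_samples.jsonl'):
    model, neox_args = setup_for_inference_or_eval()
    return generate_samples_unconditional(
        neox_args=neox_args,
        model=model,
        number_of_samples=4,
        output_file=path,
        maximum_tokens=64,
        temperature=0.7,
        top_p=0.9)
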
def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag):
    """Log training information such as losses, timing, ..."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Update losses.
    for key in loss_dict:
        total_loss_dict[key] = total_loss_dict.get(key, 0.) + loss_dict[key]

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward')
    add_to_logging('backward')
    add_to_logging('allreduce')
    add_to_logging('optimizer')
    add_to_logging('batch generator')

    # Tensorboard values.
    if writer and torch.distributed.get_rank() == 0:
        writer.add_scalar('learning_rate', learning_rate, iteration)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
        if args.fp16:
            writer.add_scalar('loss_scale', loss_scale, iteration)
        normalizer = iteration % args.log_interval
        if normalizer == 0:
            normalizer = args.log_interval
        timers.write(timers_to_log, writer, iteration, normalizer=normalizer)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration_time',
                              elapsed_time / args.log_interval, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
                                                       args.train_iters)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time * 1000.0 / args.log_interval)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        for key in total_loss_dict:
            avg = total_loss_dict[key].item() / args.log_interval
            log_string += ' {}: {:.6E} |'.format(key, avg)
            total_loss_dict[key] = 0.0
        if args.fp16:
            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        print_rank_0(log_string)
        if report_memory_flag:
            report_memory('after {} iterations'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag

def get_optimizer(model, neox_args):
    """Set up the optimizer."""
    if neox_args.no_load_optim:
        return None, None
    # Build parameter groups (weight decay and non-decay).
    param_groups = get_params_for_weight_decay_optimization(model, neox_args)
    print_rank_0(
        f'Configuring Optimizer type: {neox_args.optimizer_type} '
        f'with params: {neox_args.optimizer["params"]}')

    # Add model parallel attribute if it is not set.
    for param_group in param_groups:
        for param in param_group['params']:
            if not hasattr(param, 'model_parallel'):
                param.model_parallel = False

    if neox_args.optimizer_type.lower() in ["cpu_adam", "cpu_torch_adam"]:
        # Compare against optimizer_type; comparing the `optimizer` config
        # dict itself to a string would never match.
        if neox_args.optimizer_type.lower() == "cpu_torch_adam":
            cpu_adam_optimizer = torch.optim.Adam
        else:
            from deepspeed.ops.adam import DeepSpeedCPUAdam
            cpu_adam_optimizer = DeepSpeedCPUAdam
        optimizer = cpu_adam_optimizer(param_groups,
                                       weight_decay=neox_args.weight_decay,
                                       **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "onebitadam":
        assert neox_args.deepspeed
        # onebitadam needs to be instantiated within the deepspeed engine to work :|
        optimizer = None
    elif neox_args.optimizer_type.lower() == "sm3":
        from .optimizers import SM3
        optimizer = SM3(param_groups, **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "madgrad_wd":
        from .optimizers import madgrad_wd
        optimizer = madgrad_wd(param_groups,
                               weight_decay=neox_args.weight_decay,
                               **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "adam":
        # Use Adam.
        try:
            # Default to apex as it's slightly faster.
            from apex.optimizers import FusedAdam as Adam
        except ImportError:
            # If apex isn't installed, use deepspeed's FusedAdam.
            print("WARNING: APEX not installed - defaulting to deepspeed's fused adam")
            from deepspeed.ops.adam import FusedAdam as Adam
        optimizer = Adam(param_groups,
                         weight_decay=neox_args.weight_decay,
                         **neox_args.optimizer["params"])
    else:
        raise ValueError(
            f"Optimizer type {neox_args.optimizer_type} not recognized")

    if neox_args.deepspeed:
        # fp16 wrapper is not required for DeepSpeed.
        return optimizer, param_groups
    else:
        raise ValueError("Must be using deepspeed to run neox")

def model_provider():
    """Build the model."""
    print_rank_0("building VIT model ...")
    args = get_args()
    model = VitModel(num_classes=args.num_classes)
    return model