def evaluate_and_print_results(prefix, forward_step_func, data_iterator,
                               model, iteration, verbose=False):
    """Helper function to evaluate and dump results on screen."""
    args = get_args()
    writer = get_tensorboard_writer()

    total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose)
    string = ' validation loss at {} | '.format(prefix)
    for key in total_loss_dict:
        string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
        # Clamp the exponent so the perplexity cannot overflow for very
        # large validation losses.
        ppl = math.exp(min(20, total_loss_dict[key].item()))
        string += '{} PPL: {:.6E} | '.format(key, ppl)
        if writer and is_last_rank():
            writer.add_scalar('{} value-validation'.format(key),
                              total_loss_dict[key].item(), iteration)
            writer.add_scalar('{} ppl-validation'.format(key), ppl, iteration)
            writer.add_scalar('{} value-validation vs samples'.format(key),
                              total_loss_dict[key].item(),
                              args.consumed_train_samples)
            writer.add_scalar('{} ppl-validation vs samples'.format(key),
                              ppl, args.consumed_train_samples)

    length = len(string) + 1
    print_rank_last('-' * length)
    print_rank_last(string)
    print_rank_last('-' * length)

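# The wrappers above and below all delegate to an `evaluate` helper that is
# not part of this excerpt. The sketch below is an assumption, not the actual
# implementation: it captures only the contract the callers rely on, namely
# that `forward_step_func(data_iterator, model)` returns an
# (output, loss_dict) tuple and that the averaged per-key losses come back as
# tensors with an .item() method. `args.eval_iters` is the usual Megatron
# argument for the number of evaluation batches.
def evaluate(forward_step_func, data_iterator, model, verbose=False):
    args = get_args()
    total_loss_dict = {}
    model.eval()
    with torch.no_grad():
        for it in range(args.eval_iters):
            if verbose and (it + 1) % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(
                    it + 1, args.eval_iters))
            # Accumulate the losses reported for this batch.
            _, loss_dict = forward_step_func(data_iterator, model)
            for key in loss_dict:
                total_loss_dict[key] = total_loss_dict.get(
                    key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
    model.train()
    # Average over the number of evaluated batches.
    for key in total_loss_dict:
        total_loss_dict[key] = total_loss_dict[key] / args.eval_iters
    return total_loss_dict
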
def _write_args_to_tensorboard():
    """Write arguments to tensorboard."""
    args = get_args()
    writer = get_tensorboard_writer()
    if writer:
        for arg in vars(args):
            writer.add_text(arg, str(getattr(args, arg)))

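# Hypothetical usage (not in the original excerpt): this helper is typically
# called once right after initialization, so the full argument namespace
# lands in TensorBoard's text tab before the training loop starts. The
# `if writer` guard already makes the call a no-op on ranks that were not
# given a writer.
#
#     _write_args_to_tensorboard()
#     for iteration in range(args.train_iters):
#         ...
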
def evaluate_and_print_results(prefix, forward_step_func, data_iterator,
                               model, iteration, verbose=False):
    """Helper function to evaluate and dump results on screen."""
    writer = get_tensorboard_writer()

    total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose)
    string = ' validation loss at {} | '.format(prefix)
    for key in total_loss_dict:
        string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
        ppl = math.exp(min(20, total_loss_dict[key].item()))
        string += '{} PPL: {:.6E} | '.format(key, ppl)
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('{} value'.format(key),
                              total_loss_dict[key].item(), iteration)
            writer.add_scalar('{} ppl'.format(key), ppl, iteration)

    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)

def evaluate_and_print_results(prefix, forward_step_func, data_iterator,
                               model, iteration, verbose=False):
    """Helper function to evaluate and dump results on screen."""
    writer = get_tensorboard_writer()

    # Pipeline parallelism needs eval_batch() instead of a simple forward().
    args = get_args()
    if args.pipe_parallel_size > 0:
        def _eval_helper(data_iter, pipe_model):
            # eval_batch() schedules the forward passes across the pipeline
            # stages and returns the reduced loss; wrap it so it matches the
            # (output, loss_dict) contract `evaluate` expects.
            loss = model.eval_batch(data_iter)
            return None, {'lm loss': loss}
        forward_step_func = _eval_helper

    total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose)
    string = ' validation loss at {} | '.format(prefix)
    for key in total_loss_dict:
        string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
        ppl = math.exp(min(20, total_loss_dict[key].item()))
        string += '{} PPL: {:.6E} | '.format(key, ppl)
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('{} value'.format(key),
                              total_loss_dict[key].item(), iteration)
            writer.add_scalar('{} ppl'.format(key), ppl, iteration)

    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)

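# For the non-pipeline path, `evaluate` expects a forward-step callable with
# the same (data_iterator, model) -> (output, loss_dict) shape that
# _eval_helper emulates above. A hedged sketch of such a function follows;
# `get_batch` and the exact model call signature are assumptions, not part of
# this excerpt.
def forward_step_sketch(data_iterator, model):
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    losses = model(tokens, position_ids, attention_mask, labels=labels)
    # Mask out padding positions and average to a scalar LM loss.
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
    return losses, {'lm loss': loss}
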
def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Update losses.
    for key in loss_dict:
        total_loss_dict[key] = total_loss_dict.get(key, 0.) + loss_dict[key]

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward')
    add_to_logging('backward')
    add_to_logging('allreduce')
    add_to_logging('optimizer')
    add_to_logging('batch generator')

    # Tensorboard values.
    if writer and torch.distributed.get_rank() == 0:
        writer.add_scalar('learning_rate', learning_rate, iteration)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
        if args.fp16:
            writer.add_scalar('loss_scale', loss_scale, iteration)
        normalizer = iteration % args.log_interval
        if normalizer == 0:
            normalizer = args.log_interval
        timers.write(timers_to_log, writer, iteration, normalizer=normalizer)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration_time',
                              elapsed_time / args.log_interval, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
                                                       args.train_iters)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time * 1000.0 / args.log_interval)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        for key in total_loss_dict:
            avg = total_loss_dict[key].item() / args.log_interval
            log_string += ' {}: {:.6E} |'.format(key, avg)
            total_loss_dict[key] = 0.0
        if args.fp16:
            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        print_rank_0(log_string)
        if report_memory_flag:
            report_memory('after {} iterations'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag

def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag, skipped_iter):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Update losses.
    skipped_iters_key = 'skipped iterations'
    total_loss_dict[skipped_iters_key] = total_loss_dict.get(
        skipped_iters_key, 0) + skipped_iter
    got_nan_key = 'got nan'

    got_nan = False
    for key in loss_dict:
        if not skipped_iter:
            total_loss_dict[key] = total_loss_dict.get(key, 0.) + loss_dict[key]
        else:
            value = loss_dict[key].float().sum().item()
            # `value != value` is the standard NaN check.
            is_nan = value == float('inf') or \
                     value == -float('inf') or \
                     value != value
            got_nan = got_nan or is_nan
    total_loss_dict[got_nan_key] = total_loss_dict.get(
        got_nan_key, 0) + int(got_nan)

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward')
    add_to_logging('VocabParallelEmbedding forward reduce')
    add_to_logging('ColumnParallelLinear forward gather')
    add_to_logging('RowParallelLinear forward reduce')
    add_to_logging('backward')
    add_to_logging('backward-backward')
    add_to_logging('backward-allreduce')
    add_to_logging('backward-master-grad')
    add_to_logging('backward-clip-grad')
    add_to_logging('optimizer')
    add_to_logging('batch generator')
    add_to_logging('_reduce inside')
    add_to_logging('_gather inside')
    add_to_logging('CopyToModelParallelRegion BACKWARD _reduce')
    add_to_logging('ReduceFromModelParallelRegion SYMBOLIC _reduce')
    add_to_logging('ReduceFromModelParallelRegion FORWARD _reduce')
    add_to_logging('ScatterToModelParallelRegion BACKWARD _gather')
    add_to_logging('GatherFromModelParallelRegion SYMBOLIC _gather')
    add_to_logging('GatherFromModelParallelRegion FORWARD _gather')

    # Tensorboard values.
    if writer and torch.distributed.get_rank() == 0:
        writer.add_scalar('learning_rate', learning_rate, iteration)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
        if args.fp16:
            writer.add_scalar('loss_scale', loss_scale, iteration)
        normalizer = iteration % args.log_interval
        if normalizer == 0:
            normalizer = args.log_interval
        timers.write(timers_to_log, writer, iteration, normalizer=normalizer)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration_time',
                              elapsed_time / args.log_interval, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
                                                       args.train_iters)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time * 1000.0 / args.log_interval)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        # Average only over the iterations that actually advanced.
        num_iterations = max(
            1, args.log_interval - total_loss_dict[skipped_iters_key])
        for key in total_loss_dict:
            if key not in [skipped_iters_key, got_nan_key]:
                avg = total_loss_dict[key].item() / float(num_iterations)
                log_string += ' {}: {:.6E} |'.format(key, avg)
                total_loss_dict[key] = 0.0
        if args.fp16:
            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        log_string += ' number of skipped iterations: {:3d} |'.format(
            total_loss_dict[skipped_iters_key])
        log_string += ' number of nan iterations: {:3d} |'.format(
            total_loss_dict[got_nan_key])
        total_loss_dict[skipped_iters_key] = 0
        total_loss_dict[got_nan_key] = 0
        print_rank_0(log_string)
        if report_memory_flag:
            report_memory('after {} iterations'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag

def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag, skipped_iter, model=None):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Update losses.
    skipped_iters_key = 'skipped iterations'
    total_loss_dict[skipped_iters_key] = total_loss_dict.get(
        skipped_iters_key, 0) + skipped_iter
    got_nan_key = 'got nan'

    got_nan = False
    for key in loss_dict:
        if not skipped_iter:
            total_loss_dict[key] = total_loss_dict.get(key, 0.) + loss_dict[key]
        else:
            value = loss_dict[key].float().sum().item()
            is_nan = value == float('inf') or \
                     value == -float('inf') or \
                     value != value
            got_nan = got_nan or is_nan
    total_loss_dict[got_nan_key] = total_loss_dict.get(
        got_nan_key, 0) + int(got_nan)

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward')
    add_to_logging('backward')
    add_to_logging('backward-backward')
    add_to_logging('backward-allreduce')
    add_to_logging('backward-master-grad')
    add_to_logging('backward-clip-grad')
    add_to_logging('optimizer')
    add_to_logging('batch generator')

    # Tensorboard values.
    if writer and torch.distributed.get_rank() == 0:
        writer.add_scalar('tokens', args.tokens, iteration)
        writer.add_scalar('learning_rate', learning_rate, iteration)
        writer.add_scalar('learning_rate/vs tokens', learning_rate,
                          args.tokens)
        if args.curriculum_learning:
            writer.add_scalar('seqlen', args.curriculum_seqlen, iteration)
            writer.add_scalar('seqlen/vs tokens', args.curriculum_seqlen,
                              args.tokens)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
            writer.add_scalar(key + '/vs tokens', loss_dict[key], args.tokens)
        if args.fp16:
            writer.add_scalar('loss_scale', loss_scale, iteration)
        normalizer = iteration % args.log_interval
        if normalizer == 0:
            normalizer = args.log_interval
        timers.write(timers_to_log, writer, iteration, normalizer=normalizer)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration_time',
                              elapsed_time / args.log_interval, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
                                                       args.train_iters)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time * 1000.0 / args.log_interval)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        num_iterations = max(
            1, args.log_interval - total_loss_dict[skipped_iters_key])
        for key in total_loss_dict:
            if key not in [skipped_iters_key, got_nan_key]:
                avg = total_loss_dict[key].item() / float(num_iterations)
                log_string += ' {}: {:.6E} |'.format(key, avg)
                total_loss_dict[key] = 0.0
        if args.fp16:
            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        log_string += ' number of skipped iterations: {:3d} |'.format(
            total_loss_dict[skipped_iters_key])
        log_string += ' number of nan iterations: {:3d} |'.format(
            total_loss_dict[got_nan_key])
        total_loss_dict[skipped_iters_key] = 0
        total_loss_dict[got_nan_key] = 0
        print_rank_0(log_string)
        if report_memory_flag:
            report_memory('after {} iterations'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)
        flops_calculator(model, args, elapsed_time)

    return report_memory_flag

def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag, skipped_iter):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Advanced, skipped, and Nan iterations.
    advanced_iters_key = 'advanced iterations'
    skipped_iters_key = 'skipped iterations'
    nan_iters_key = 'nan iterations'
    # Advanced iterations.
    if not skipped_iter:
        total_loss_dict[advanced_iters_key] = total_loss_dict.get(
            advanced_iters_key, 0) + 1
    else:
        if advanced_iters_key not in total_loss_dict:
            total_loss_dict[advanced_iters_key] = 0
    # Skipped iterations.
    total_loss_dict[skipped_iters_key] = total_loss_dict.get(
        skipped_iters_key, 0) + skipped_iter
    # Update losses and set nan iterations.
    got_nan = False
    for key in loss_dict:
        if not skipped_iter:
            total_loss_dict[key] = total_loss_dict.get(
                key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
        else:
            value = loss_dict[key].float().sum().item()
            is_nan = value == float('inf') or \
                     value == -float('inf') or \
                     value != value
            got_nan = got_nan or is_nan
    total_loss_dict[nan_iters_key] = total_loss_dict.get(
        nan_iters_key, 0) + int(got_nan)

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward-compute')
    add_to_logging('forward-recv')
    add_to_logging('forward-send')
    add_to_logging('forward-send-backward-recv')
    add_to_logging('backward-compute')
    add_to_logging('backward-recv')
    add_to_logging('backward-send')
    add_to_logging('backward-send-forward-recv')
    add_to_logging('backward-params-all-reduce')
    add_to_logging('backward-embedding-all-reduce')
    add_to_logging('optimizer-copy-to-main-grad')
    add_to_logging('optimizer-unscale-and-check-inf')
    add_to_logging('optimizer-clip-main-grad')
    add_to_logging('optimizer-copy-main-to-model-params')
    add_to_logging('optimizer')
    add_to_logging('batch-generator')

    # Calculate batch size.
    batch_size = args.micro_batch_size * args.data_parallel_size * \
        get_num_microbatches()
    total_iterations = total_loss_dict[advanced_iters_key] + \
        total_loss_dict[skipped_iters_key]

    # Tensorboard values.
    if writer and is_last_rank():
        writer.add_scalar('learning-rate', learning_rate, iteration)
        writer.add_scalar('learning-rate vs samples', learning_rate,
                          args.consumed_train_samples)
        writer.add_scalar('batch-size', batch_size, iteration)
        writer.add_scalar('batch-size vs samples', batch_size,
                          args.consumed_train_samples)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
            writer.add_scalar(key + ' vs samples', loss_dict[key],
                              args.consumed_train_samples)
        writer.add_scalar('loss-scale', loss_scale, iteration)
        writer.add_scalar('loss-scale vs samples', loss_scale,
                          args.consumed_train_samples)
        timers.write(timers_to_log, writer, iteration,
                     normalizer=total_iterations)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        elapsed_time_per_iteration = elapsed_time / total_iterations
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration-time',
                              elapsed_time_per_iteration, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
                                                       args.train_iters)
        log_string += ' consumed samples: {:12d} |'.format(
            args.consumed_train_samples)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time_per_iteration * 1000.0)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        log_string += ' global batch size: {:5d} |'.format(batch_size)
        for key in total_loss_dict:
            if key not in [advanced_iters_key, skipped_iters_key,
                           nan_iters_key]:
                avg = total_loss_dict[key].item() / \
                    float(max(1, total_loss_dict[advanced_iters_key]))
                if avg > 0.0:
                    log_string += ' {}: {:.6E} |'.format(key, avg)
                total_loss_dict[key] = torch.cuda.FloatTensor([0.0])
        log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        log_string += ' number of skipped iterations: {:3d} |'.format(
            total_loss_dict[skipped_iters_key])
        log_string += ' number of nan iterations: {:3d} |'.format(
            total_loss_dict[nan_iters_key])
        total_loss_dict[advanced_iters_key] = 0
        total_loss_dict[skipped_iters_key] = 0
        total_loss_dict[nan_iters_key] = 0
        print_rank_last(log_string)
        if report_memory_flag and learning_rate > 0.:
            # Report memory after optimizer state has been initialized.
            report_memory('(after {} iterations)'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag

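# Worked example of the batch-size calculation above (illustrative numbers,
# not from the original source): with micro_batch_size = 4,
# data_parallel_size = 8, and get_num_microbatches() = 16, every advanced
# iteration consumes 4 * 8 * 16 = 512 samples, so the logged
# "global batch size" is 512 and `consumed samples` grows by 512 per step.
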
def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag, skipped_iter, model):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Update losses.
    skipped_iters_key = 'skipped iterations'
    total_loss_dict[skipped_iters_key] = total_loss_dict.get(
        skipped_iters_key, 0) + skipped_iter
    got_nan_key = 'got nan'

    got_nan = False
    for key in loss_dict:
        if not skipped_iter:
            total_loss_dict[key] = total_loss_dict.get(key, 0.) + loss_dict[key]
        else:
            value = loss_dict[key].float().sum().item()
            is_nan = value == float('inf') or \
                     value == -float('inf') or \
                     value != value
            got_nan = got_nan or is_nan
    total_loss_dict[got_nan_key] = total_loss_dict.get(
        got_nan_key, 0) + int(got_nan)

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    if args.pipe_parallel_size <= 0:
        add_to_logging('forward')
        add_to_logging('backward')
        add_to_logging('backward-backward')
        add_to_logging('backward-allreduce')
        add_to_logging('backward-master-grad')
        add_to_logging('backward-clip-grad')
        add_to_logging('optimizer')
        add_to_logging('batch generator')
    else:
        # With pipeline parallelism, the Megatron timers are overridden by
        # the DeepSpeed ones. Try to grab timer values from the model engine.
        # This was only recently added to DeeperSpeed, so check that the
        # engine has the attribute first.
        if hasattr(model, 'timer_values') and model.timer_values is not None:
            if model.wall_clock_breakdown() and \
                    model.global_steps % model.steps_per_print() == 0:
                timer_values = model.timer_values
                # DeepSpeed already logs to tensorboard / prints values,
                # so just log to wandb.
                if get_use_wandb() and torch.distributed.get_rank() == 0:
                    for key in timer_values:
                        wandb.log({key: timer_values[key]}, step=iteration)

    # Log timer info to tensorboard and wandb.
    normalizer = iteration % args.log_interval
    if normalizer == 0:
        normalizer = args.log_interval
    if torch.distributed.get_rank() == 0:
        timers.write(names=timers_to_log, iteration=iteration,
                     normalizer=normalizer)

    # wandb writer.
    if get_use_wandb() and torch.distributed.get_rank() == 0:
        wandb.log({'learning_rate': learning_rate}, step=iteration)
        for key in loss_dict:
            wandb.log({key: loss_dict[key]}, step=iteration)
        if args.fp16:
            wandb.log({'loss_scale': loss_scale}, step=iteration)

    # Tensorboard values.
    if writer and torch.distributed.get_rank() == 0:
        writer.add_scalar('learning_rate', learning_rate, iteration)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
        if args.fp16:
            writer.add_scalar('loss_scale', loss_scale, iteration)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        iteration_time = elapsed_time / args.log_interval
        samples_per_sec = get_global_batch_size(args) / iteration_time
        log_string = ' samples/sec: {:.3f} |'.format(samples_per_sec)
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('samples/sec', samples_per_sec, iteration)
            writer.add_scalar('iteration_time', iteration_time, iteration)
        if get_use_wandb() and torch.distributed.get_rank() == 0:
            wandb.log({'samples/sec': samples_per_sec}, step=iteration)
            wandb.log({'iteration_time': iteration_time}, step=iteration)
        log_string += ' iteration {:8d}/{:8d} |'.format(
            iteration, args.train_iters)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time * 1000.0 / args.log_interval)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        num_iterations = max(
            1, args.log_interval - total_loss_dict[skipped_iters_key])

        # Calculate approximate TFLOP/s per GPU.
        flops_per_s_per_gpu = get_flops(model, iteration_time)
        log_string += f' approx flops per GPU: {human_readable_flops(flops_per_s_per_gpu)} |'
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('flops/s/gpu', flops_per_s_per_gpu, iteration)
        if get_use_wandb() and torch.distributed.get_rank() == 0:
            wandb.log({'flops/s/gpu': flops_per_s_per_gpu}, step=iteration)

        for key in total_loss_dict:
            if key not in [skipped_iters_key, got_nan_key]:
                v = total_loss_dict[key].item() if hasattr(
                    total_loss_dict[key], 'item') else total_loss_dict[key]
                avg = v / float(num_iterations)
                log_string += ' {}: {:.6E} |'.format(key, avg)
                total_loss_dict[key] = 0.0
        if args.fp16:
            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        log_string += ' number of skipped iterations: {:3d} |'.format(
            total_loss_dict[skipped_iters_key])
        log_string += ' number of nan iterations: {:3d} |'.format(
            total_loss_dict[got_nan_key])
        total_loss_dict[skipped_iters_key] = 0
        total_loss_dict[got_nan_key] = 0
        print_rank_0(log_string)
        if report_memory_flag:
            report_memory('after {} iterations'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag
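

# The function above calls `get_flops` and `human_readable_flops`, which are
# not part of this excerpt. The sketches below are assumptions: a plain
# metric-suffix formatter, and a throughput estimate based on the common
# "6 * parameters * tokens" approximation of forward+backward FLOPs. The
# real helpers may use a more detailed per-layer formula; `args.seq_length`
# is also an assumption here.
def human_readable_flops(n):
    # Format a raw FLOP/s count with a metric suffix, e.g. '123.45 TFLOPS'.
    for suffix in ('', 'K', 'M', 'G', 'T', 'P'):
        if abs(n) < 1000.0:
            return '{:.2f} {}FLOPS'.format(n, suffix)
        n /= 1000.0
    return '{:.2f} EFLOPS'.format(n)


def get_flops(model, iteration_time_s):
    # Rough per-GPU estimate: 6 FLOPs per parameter per token covers the
    # forward pass (2) plus the backward pass (4), divided across all ranks.
    args = get_args()
    world_size = torch.distributed.get_world_size()
    num_params = sum(p.numel() for p in model.parameters())
    tokens_per_iteration = get_global_batch_size(args) * args.seq_length
    return 6 * num_params * tokens_per_iteration / (
        iteration_time_s * world_size)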