def main(): """ Launches translation (inference). Inference is executed on a single GPU, implementation supports beam search with length normalization and coverage penalty. """ args = parse_args() device = utils.set_device(args.cuda, args.local_rank) utils.init_distributed(args.cuda) args.rank = utils.get_rank() utils.setup_logging() if args.env: utils.log_env_info() logging.info(f'Run arguments: {args}') if not args.cuda and torch.cuda.is_available(): warnings.warn('cuda is available but not enabled') if not args.cudnn: torch.backends.cudnn.enabled = False # load checkpoint and deserialize to CPU (to save GPU memory) checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'}) # build GNMT model tokenizer = Tokenizer() tokenizer.set_state(checkpoint['tokenizer']) model_config = checkpoint['model_config'] model_config['batch_first'] = args.batch_first model_config['vocab_size'] = tokenizer.vocab_size model = GNMT(**model_config) model.load_state_dict(checkpoint['state_dict']) # construct the dataset if args.input: data = RawTextDataset( raw_datafile=args.input, tokenizer=tokenizer, sort=args.sort, ) elif args.input_text: data = RawTextDataset( raw_data=args.input_text, tokenizer=tokenizer, sort=args.sort, ) latency_table = tables.LatencyTable(args.percentiles) throughput_table = tables.ThroughputTable(args.percentiles) accuracy_table = tables.AccuracyTable('BLEU') dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor} for (math, batch_size, beam_size) in product(args.math, args.batch_size, args.beam_size): logging.info(f'math: {math}, batch size: {batch_size}, ' f'beam size: {beam_size}') model.type(dtype[math]) model = model.to(device) model.eval() # build the data loader loader = data.get_loader( batch_size=batch_size, batch_first=args.batch_first, pad=True, repeat=args.repeat[batch_size], num_workers=0, ) # build the translator object translator = Translator( model=model, tokenizer=tokenizer, loader=loader, beam_size=beam_size, max_seq_len=args.max_seq_len, len_norm_factor=args.len_norm_factor, len_norm_const=args.len_norm_const, cov_penalty_factor=args.cov_penalty_factor, print_freq=args.print_freq, ) # execute the inference output, stats = translator.run( calc_bleu=args.bleu, eval_path=args.output, summary=True, warmup=args.warmup, reference_path=args.reference, ) # print translated outputs if not args.output and args.rank == 0: logging.info(f'Translated output:') for out in output: print(out) key = (batch_size, beam_size) latency_table.add(key, {math: stats['runtimes']}) throughput_table.add(key, {math: stats['throughputs']}) accuracy_table.add(key, {math: stats['bleu']}) if args.tables: accuracy_table.write('Inference accuracy', args.math) if 'fp16' in args.math and 'fp32' in args.math: relative = 'fp32' else: relative = None if 'fp32' in args.math: throughput_table.write('Inference throughput', 'fp32') if 'fp16' in args.math: throughput_table.write('Inference throughput', 'fp16', relative=relative) if 'fp32' in args.math: latency_table.write('Inference latency', 'fp32') if 'fp16' in args.math: latency_table.write('Inference latency', 'fp16', relative=relative, reverse_speedup=True) passed = utils.benchmark(stats['bleu'], args.target_bleu, stats['tokens_per_sec'], args.target_perf) return passed
def main(): """ Launches translation (inference). Inference is executed on a single GPU, implementation supports beam search with length normalization and coverage penalty. """ args = parse_args() if args.affinity != 'disabled': nproc_per_node = torch.cuda.device_count() affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node, args.affinity) print(f'{args.local_rank}: thread affinity: {affinity}') device = utils.set_device(args.cuda, args.local_rank) utils.init_distributed(args.cuda) args.rank = utils.get_rank() os.makedirs(args.save_dir, exist_ok=True) utils.setup_logging() dllog_file = os.path.join(args.save_dir, args.dllog_file) utils.setup_dllogger(enabled=True, filename=dllog_file) if args.profile: try: pyprof.init(enable_function_stack=True) except NameError: warnings.warn('Called pyprof.init() but pyprof is not available') if args.env: utils.log_env_info() logging.info(f'Run arguments: {args}') dllogger.log(step='PARAMETER', data=vars(args)) if not args.cuda and torch.cuda.is_available(): warnings.warn('cuda is available but not enabled') if not args.cudnn: torch.backends.cudnn.enabled = False # load checkpoint and deserialize to CPU (to save GPU memory) if args.model: checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'}) # build GNMT model tokenizer = Tokenizer() tokenizer.set_state(checkpoint['tokenizer']) model_config = checkpoint['model_config'] model_config['batch_first'] = args.batch_first model_config['vocab_size'] = tokenizer.vocab_size model = GNMT(**model_config) model.load_state_dict(checkpoint['state_dict']) elif args.synthetic: model = GNMT(args.synthetic_vocab, batch_first=args.batch_first) tokenizer = None else: raise RuntimeError( 'Specify model either with --synthetic or with --model flag') # construct the dataset if args.input: data = RawTextDataset( raw_datafile=args.input, tokenizer=tokenizer, sort=args.sort, ) elif args.input_text: data = RawTextDataset( raw_data=args.input_text, tokenizer=tokenizer, sort=args.sort, ) elif args.synthetic: data = SyntheticDataset(args.synthetic_vocab, args.synthetic_len, args.batch_size[0] * args.synthetic_batches) latency_table = tables.LatencyTable(args.percentiles) throughput_table = tables.ThroughputTable(args.percentiles) accuracy_table = tables.AccuracyTable('BLEU') dtype = { 'fp32': torch.FloatTensor, 'tf32': torch.FloatTensor, 'fp16': torch.HalfTensor } for (math, batch_size, beam_size) in product(args.math, args.batch_size, args.beam_size): logging.info(f'math: {math}, batch size: {batch_size}, ' f'beam size: {beam_size}') model.type(dtype[math]) model = model.to(device) model.eval() # build the data loader loader = data.get_loader( batch_size=batch_size, batch_first=args.batch_first, pad=True, repeat=args.repeat[batch_size], num_workers=0, ) # build the translator object translator = Translator( model=model, tokenizer=tokenizer, loader=loader, beam_size=beam_size, max_seq_len=args.max_seq_len, len_norm_factor=args.len_norm_factor, len_norm_const=args.len_norm_const, cov_penalty_factor=args.cov_penalty_factor, print_freq=args.print_freq, ) # execute the inference with torch.autograd.profiler.emit_nvtx(enabled=args.profile): output, stats = translator.run( calc_bleu=args.bleu, eval_path=args.output, summary=True, warmup=args.warmup, reference_path=args.reference, ) # print translated outputs if not args.synthetic and (not args.output and args.rank == 0): logging.info(f'Translated output:') for out in output: print(out) key = (batch_size, beam_size) latency_table.add(key, {math: 
stats['runtimes']}) throughput_table.add(key, {math: stats['throughputs']}) accuracy_table.add(key, {math: stats['bleu']}) if args.tables: accuracy_table.write('Inference accuracy', args.math) if 'fp16' in args.math and 'fp32' in args.math: relative = 'fp32' elif 'fp16' in args.math and 'tf32' in args.math: relative = 'tf32' else: relative = None if 'fp32' in args.math: throughput_table.write('Inference throughput', 'fp32') if 'tf32' in args.math: throughput_table.write('Inference throughput', 'tf32') if 'fp16' in args.math: throughput_table.write('Inference throughput', 'fp16', relative=relative) if 'fp32' in args.math: latency_table.write('Inference latency', 'fp32') if 'tf32' in args.math: latency_table.write('Inference latency', 'tf32') if 'fp16' in args.math: latency_table.write('Inference latency', 'fp16', relative=relative, reverse_speedup=True) avg_throughput = np.array(stats['throughputs']).mean() avg_latency = np.array(stats['runtimes']).mean() summary = { 'eval_throughput': avg_throughput, 'eval_bleu': stats['bleu'], 'eval_avg_latency': avg_latency, } for p in args.percentiles: summary[f'eval_{p}%_latency'] = np.percentile(stats['runtimes'], p) dllogger.log(step=tuple(), data=summary) passed = utils.benchmark(stats['bleu'], args.target_bleu, stats['tokens_per_sec'], args.target_perf) return passed
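# A minimal sketch, not part of this script, of the GNMT-style length
# normalization that the len_norm_const and len_norm_factor arguments passed
# to Translator parameterize. Following Wu et al. (2016), a hypothesis's
# log-probability is divided by lp(Y) = ((c + |Y|) / (c + 1)) ** alpha during
# beam search, so longer translations are not unfairly penalized. The function
# name and the default alpha of 0.6 are illustrative assumptions.
def length_penalty(seq_len, len_norm_const=5.0, len_norm_factor=0.6):
    return ((len_norm_const + seq_len) /
            (len_norm_const + 1)) ** len_norm_factor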
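# Minimal entry-point sketch, assuming this script is run standalone: the
# benchmark result from main() is propagated through the process exit code so
# wrapper scripts can detect a missed BLEU or performance target. The
# exit-code convention is an assumption, not confirmed by this section.
if __name__ == '__main__':
    passed = main()
    if not passed:
        sys.exit(1)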