def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    utils.setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model_config['vocab_size'] = tokenizer.vocab_size
    model = GNMT(**model_config)
    model.load_state_dict(checkpoint['state_dict'])

    # construct the dataset
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor}

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        output, stats = translator.run(
            calc_bleu=args.bleu,
            eval_path=args.output,
            summary=True,
            warmup=args.warmup,
            reference_path=args.reference,
        )

        # print translated outputs
        if not args.output and args.rank == 0:
            logging.info('Translated output:')
            for out in output:
                print(out)

        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput', 'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency', 'fp16',
                                relative=relative, reverse_speedup=True)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
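# main() above returns the boolean produced by utils.benchmark(); how that
# result is consumed is not part of this listing. A minimal, hypothetical
# entry-point sketch (names and behavior are assumptions, not the repository's
# code) could map a failed benchmark to a non-zero exit status:
if __name__ == '__main__':
    import sys

    passed = main()
    # exit with status 1 if the BLEU / performance targets were not met
    if not passed:
        sys.exit(1)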
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()
    args.batch_first = False

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    num_stages = args.num_stages

    # compute BLEU score for every epoch
    print("Epoch\tBLEU score")
    epoch = 0
    while True:
        # no more epochs to run, since desired file not available
        if not os.path.isfile(
            os.path.join(args.checkpoint_path,
                         f"checkpoint.0.pth.tar.epoch.{epoch}")):
            break

        module = importlib.import_module(args.module)
        model = module.model(None)
        num_modules = len(model)

        key_to_module_mapping = OrderedDict()
        all_stages_state_dict = OrderedDict()
        module_id = 0
        stage_id = 0
        for stage_id in range(num_stages):
            # load the checkpoint associated with a stage
            full_checkpoint_path = os.path.join(
                args.checkpoint_path,
                f"checkpoint.{stage_id}.pth.tar.epoch.{epoch}")
            checkpoint = torch.load(full_checkpoint_path,
                                    map_location=torch.device('cpu'))

            # iterate through all modules in stage_id's checkpoint
            local_module_id = 0
            # quit when checkpoints for all modules in full model are loaded
            while module_id < num_modules:
                # load checkpoint corresponding to different modules in our
                # runtime
                state_dict = checkpoint["state_dict"]
                state_dict_key = "module%d" % local_module_id
                if state_dict_key not in state_dict:
                    break
                state_dict = checkpoint["state_dict"][state_dict_key]

                # remove mask buffer
                keys_to_delete = []
                for key in state_dict:
                    if "mask" in key:
                        keys_to_delete.append(key)
                for key in keys_to_delete:
                    del state_dict[key]

                if checkpoint_from_distributed(state_dict):
                    state_dict = unwrap_distributed(state_dict)

                # collect all state_dicts in a single OrderedDict
                for key in state_dict:
                    all_stages_state_dict[(stage_id, local_module_id,
                                           key)] = state_dict[key]

                stage_module, _, _ = model[module_id]
                for key in state_dict:
                    # key_to_module_mapping maps key (in state_dict) to the
                    # torch.nn.Module wrapping the parameter and the name
                    # of parameter (weight, bias, etc.)
                    key_to_module_mapping[(stage_id, local_module_id, key)] = \
                        get_submodule_and_parameter_name(stage_module, key)

                # load tokenizer state
                tokenizer = Tokenizer()
                tokenizer.set_state(checkpoint['tokenizer'])
                vocab_size = tokenizer.vocab_size

                local_module_id += 1
                module_id += 1

        epoch += 1

        # build model, and load state dict
        model_config = {
            'vocab_size': vocab_size,
            'batch_first': args.batch_first,
            'hidden_size': 1024,
            'num_layers': args.num_layers,
            'dropout': 0.2,
            'share_embedding': False
        }
        model = GNMT(**model_config)
        model_state_dict = OrderedDict()
        for real_key in model.state_dict():
            (module, parameter_name) = get_submodule_and_parameter_name(
                model, real_key)
            # find key in all_stages_state_dict that corresponds to real_key in
            # model's state_dict
            for key in key_to_module_mapping:
                (module2, parameter_name2) = key_to_module_mapping[key]
                if parameter_name == parameter_name2 and str(module) == str(
                        module2):
                    break
            if parameter_name == parameter_name2 and str(module) == str(
                    module2):
                model_state_dict[real_key] = all_stages_state_dict[key]
                del key_to_module_mapping[key]
                del all_stages_state_dict[key]

        # load state_dict into model, and perform inference
        model.load_state_dict(model_state_dict)

        if args.math == 'fp32':
            dtype = torch.FloatTensor
        if args.math == 'fp16':
            dtype = torch.HalfTensor
        model.type(dtype)
        model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=False)

        # build the data loader
        test_loader = test_data.get_loader(world_size=1,
                                           rank=0,
                                           batch_size=args.batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=args.beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        test_bleu, _ = translator.run(calc_bleu=args.bleu,
                                      eval_path=args.output,
                                      reference_path=args.reference,
                                      summary=True)
        print(f'{epoch}\t{test_bleu:.2f}')
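# The checkpoint-merging logic above relies on get_submodule_and_parameter_name
# to map a state_dict key to the torch.nn.Module owning the parameter plus the
# parameter's local name. Its implementation is not shown in this listing; the
# following is only an illustrative sketch consistent with that usage, assuming
# dotted parameter keys such as "encoder.rnn_layers.0.weight_ih_l0":
def get_submodule_and_parameter_name_sketch(module, key):
    # walk down all but the last dotted component to reach the owning module
    attribute_names = key.split(".")
    submodule = module
    for attribute_name in attribute_names[:-1]:
        submodule = getattr(submodule, attribute_name)
    # return the owning module and the parameter name ("weight", "bias", ...)
    return submodule, attribute_names[-1]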
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()

    if args.affinity != 'disabled':
        nproc_per_node = torch.cuda.device_count()
        affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node,
                                             args.affinity)
        print(f'{args.local_rank}: thread affinity: {affinity}')

    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    os.makedirs(args.save_dir, exist_ok=True)
    utils.setup_logging()
    dllog_file = os.path.join(args.save_dir, args.dllog_file)
    utils.setup_dllogger(enabled=True, filename=dllog_file)

    if args.profile:
        try:
            pyprof.init(enable_function_stack=True)
        except NameError:
            warnings.warn('Called pyprof.init() but pyprof is not available')

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')
    dllogger.log(step='PARAMETER', data=vars(args))

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    if args.model:
        checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

        # build GNMT model
        tokenizer = Tokenizer()
        tokenizer.set_state(checkpoint['tokenizer'])
        model_config = checkpoint['model_config']
        model_config['batch_first'] = args.batch_first
        model_config['vocab_size'] = tokenizer.vocab_size
        model = GNMT(**model_config)
        model.load_state_dict(checkpoint['state_dict'])
    elif args.synthetic:
        model = GNMT(args.synthetic_vocab, batch_first=args.batch_first)
        tokenizer = None
    else:
        raise RuntimeError(
            'Specify model either with --synthetic or with --model flag')

    # construct the dataset
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.synthetic:
        data = SyntheticDataset(args.synthetic_vocab, args.synthetic_len,
                                args.batch_size[0] * args.synthetic_batches)

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    dtype = {
        'fp32': torch.FloatTensor,
        'tf32': torch.FloatTensor,
        'fp16': torch.HalfTensor
    }

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        with torch.autograd.profiler.emit_nvtx(enabled=args.profile):
            output, stats = translator.run(
                calc_bleu=args.bleu,
                eval_path=args.output,
                summary=True,
                warmup=args.warmup,
                reference_path=args.reference,
            )

        # print translated outputs
        if not args.synthetic and (not args.output and args.rank == 0):
            logging.info('Translated output:')
            for out in output:
                print(out)

        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        elif 'fp16' in args.math and 'tf32' in args.math:
            relative = 'tf32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'tf32' in args.math:
            throughput_table.write('Inference throughput', 'tf32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput', 'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'tf32' in args.math:
            latency_table.write('Inference latency', 'tf32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency', 'fp16',
                                relative=relative, reverse_speedup=True)

    avg_throughput = np.array(stats['throughputs']).mean()
    avg_latency = np.array(stats['runtimes']).mean()
    summary = {
        'eval_throughput': avg_throughput,
        'eval_bleu': stats['bleu'],
        'eval_avg_latency': avg_latency,
    }
    for p in args.percentiles:
        summary[f'eval_{p}%_latency'] = np.percentile(stats['runtimes'], p)

    dllogger.log(step=tuple(), data=summary)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
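# utils.benchmark() is used above to turn the measured BLEU and tokens/sec into
# a single pass/fail result, but its implementation is not part of this
# listing. The sketch below is only a hedged illustration of one plausible
# contract, assuming a None target disables the corresponding check:
def benchmark_sketch(test_bleu, target_bleu, test_perf, target_perf):
    def meets(achieved, target):
        # a check is skipped when no target was requested
        return target is None or achieved >= target

    # pass only if both the accuracy and the performance targets are met
    return meets(test_bleu, target_bleu) and meets(test_perf, target_perf)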
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    setup_logging()
    logging.info(f'Run arguments: {args}')

    if args.cuda:
        torch.cuda.set_device(args.rank)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.math == 'fp32':
        dtype = torch.FloatTensor
    if args.math == 'fp16':
        dtype = torch.HalfTensor
    model.type(dtype)

    if args.cuda:
        model = model.cuda()
    model.eval()

    # construct the dataset
    test_data = TextDataset(src_fname=args.input,
                            tokenizer=tokenizer,
                            sort=False)

    # build the data loader
    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=args.batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=0,
                                       drop_last=False)

    # build the translator object
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir)

    # execute the inference
    translator.run(calc_bleu=args.bleu,
                   eval_path=args.output,
                   reference_path=args.reference,
                   summary=True)
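# checkpoint_from_distributed() and unwrap_distributed() are used above but not
# shown in this listing. torch.nn.parallel.DistributedDataParallel prefixes
# state_dict keys with "module.", so a minimal sketch of such helpers (an
# assumption, not necessarily the repository's implementation) could be:
from collections import OrderedDict


def checkpoint_from_distributed_sketch(state_dict):
    # a checkpoint saved from a DDP-wrapped model carries the "module." prefix
    return any('module.' in key for key in state_dict)


def unwrap_distributed_sketch(state_dict):
    # strip the leading "module." so keys match a plain (unwrapped) model
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        new_key = key[len('module.'):] if key.startswith('module.') else key
        new_state_dict[new_key] = value
    return new_state_dict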
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()

    utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model = GNMT(vocab_size=vocab_size, **model_config)
    model.load_state_dict(checkpoint['state_dict'])

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        if math == 'fp32':
            dtype = torch.FloatTensor
        if math == 'fp16':
            dtype = torch.HalfTensor
        model.type(dtype)

        if args.cuda:
            model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=args.sort)

        # build the data loader
        test_loader = test_data.get_loader(batch_size=batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        translator.run(calc_bleu=args.bleu,
                       eval_path=args.output,
                       reference_path=args.reference,
                       summary=True)