Example #1
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    utils.setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model_config['vocab_size'] = tokenizer.vocab_size
    model = GNMT(**model_config)
    model.load_state_dict(checkpoint['state_dict'])

    # construct the dataset
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor}

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        output, stats = translator.run(
            calc_bleu=args.bleu,
            eval_path=args.output,
            summary=True,
            warmup=args.warmup,
            reference_path=args.reference,
        )

        # print translated outputs
        if not args.output and args.rank == 0:
            logging.info('Translated output:')
            for out in output:
                print(out)

        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput',
                                   'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency',
                                'fp16',
                                relative=relative,
                                reverse_speedup=True)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
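
The per-configuration loop in this example sweeps the Cartesian product of
math mode, batch size and beam size. A minimal, self-contained sketch of that
sweep pattern is below; the value lists are hypothetical stand-ins for
args.math, args.batch_size and args.beam_size:

from itertools import product

import torch

maths = ['fp16', 'fp32']    # stand-in for args.math
batch_sizes = [32, 128]     # stand-in for args.batch_size
beam_sizes = [1, 5]         # stand-in for args.beam_size

# map each math mode to the tensor type the model is cast to
dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor}

for math, batch_size, beam_size in product(maths, batch_sizes, beam_sizes):
    # one benchmark configuration per combination; the example keys its
    # result tables by (batch_size, beam_size) with one column per math mode
    print(f'math={math} batch_size={batch_size} beam_size={beam_size}')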
Example #2
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()
    args.batch_first = False

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    num_stages = args.num_stages
    # compute BLEU score for every epoch
    print("Epoch\tBLEU score")
    epoch = 0
    while True:
        # no more epochs to run, since desired file not available
        if not os.path.isfile(
                os.path.join(args.checkpoint_path,
                             f"checkpoint.0.pth.tar.epoch.{epoch}")):
            break

        # module.model(None) returns the full model as a list of tuples
        # whose first element is a torch.nn.Module (unpacked below as
        # stage_module); its length is the number of modules to load
        module = importlib.import_module(args.module)
        model = module.model(None)
        num_modules = len(model)

        key_to_module_mapping = OrderedDict()
        all_stages_state_dict = OrderedDict()
        module_id = 0
        for stage_id in range(num_stages):
            # load the checkpoint associated with a stage
            full_checkpoint_path = os.path.join(
                args.checkpoint_path,
                f"checkpoint.{stage_id}.pth.tar.epoch.{epoch}")
            checkpoint = torch.load(full_checkpoint_path,
                                    map_location=torch.device('cpu'))

            # iterate through all modules in stage_id's checkpoint
            local_module_id = 0

            # quit when checkpoints for all modules in full model are loaded
            while module_id < num_modules:

                # load checkpoint corresponding to different modules in our runtime
                state_dict = checkpoint["state_dict"]
                state_dict_key = "module%d" % local_module_id

                if state_dict_key not in state_dict:
                    break
                state_dict = checkpoint["state_dict"][state_dict_key]

                # remove mask buffer
                keys_to_delete = []
                for key in state_dict:
                    if "mask" in key:
                        keys_to_delete.append(key)
                for key in keys_to_delete:
                    del state_dict[key]

                if checkpoint_from_distributed(state_dict):
                    state_dict = unwrap_distributed(state_dict)

                # collect all state_dicts in a single OrderedDict
                for key in state_dict:
                    all_stages_state_dict[(stage_id, local_module_id,
                                           key)] = state_dict[key]

                stage_module, _, _ = model[module_id]
                for key in state_dict:
                    # key_to_module_mapping maps key (in state_dict) to the
                    # torch.nn.Module wrapping the parameter and the name
                    # of parameter (weight, bias, etc.)
                    key_to_module_mapping[(
                        stage_id, local_module_id,
                        key)] = get_submodule_and_parameter_name(
                            stage_module, key)

                # load tokenizer state
                tokenizer = Tokenizer()
                tokenizer.set_state(checkpoint['tokenizer'])
                vocab_size = tokenizer.vocab_size

                local_module_id += 1
                module_id += 1

        # build model, and load state dict
        model_config = {
            'vocab_size': vocab_size,
            'batch_first': args.batch_first,
            'hidden_size': 1024,
            'num_layers': args.num_layers,
            'dropout': 0.2,
            'share_embedding': False
        }
        model = GNMT(**model_config)
        model_state_dict = OrderedDict()
        for real_key in model.state_dict():
            (module, parameter_name) = get_submodule_and_parameter_name(
                model, real_key)
            # find the key in all_stages_state_dict that corresponds to
            # real_key in the model's state_dict
            for key in key_to_module_mapping:
                (module2, parameter_name2) = key_to_module_mapping[key]
                if (parameter_name == parameter_name2
                        and str(module) == str(module2)):
                    model_state_dict[real_key] = all_stages_state_dict[key]
                    del key_to_module_mapping[key]
                    del all_stages_state_dict[key]
                    break

        # load state_dict into model, and perform inference
        model.load_state_dict(model_state_dict)

        if args.math == 'fp32':
            dtype = torch.FloatTensor
        elif args.math == 'fp16':
            dtype = torch.HalfTensor

        model.type(dtype)
        model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=False)

        # build the data loader
        test_loader = test_data.get_loader(world_size=1,
                                           rank=0,
                                           batch_size=args.batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=args.beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        test_bleu, _ = translator.run(calc_bleu=args.bleu,
                                      eval_path=args.output,
                                      reference_path=args.reference,
                                      summary=True)
        print(f'{epoch}\t{test_bleu:.2f}')
        epoch += 1
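
Examples #2 and #4 call checkpoint_from_distributed and unwrap_distributed,
whose definitions are not shown. A minimal sketch of what such helpers
typically do, assuming the usual torch.nn.parallel.DistributedDataParallel
naming convention (the bodies below are illustrative, not the originals):

from collections import OrderedDict

def checkpoint_from_distributed(state_dict):
    # DistributedDataParallel saves parameters under a 'module.' prefix,
    # so any such key marks the checkpoint as coming from a wrapped model
    return any(key.startswith('module.') for key in state_dict)

def unwrap_distributed(state_dict):
    # strip the first 'module.' prefix so the weights load into an
    # unwrapped model
    return OrderedDict((key.replace('module.', '', 1), value)
                       for key, value in state_dict.items())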
Example #3
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()
    if args.affinity != 'disabled':
        nproc_per_node = torch.cuda.device_count()
        affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node,
                                             args.affinity)
        print(f'{args.local_rank}: thread affinity: {affinity}')
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    os.makedirs(args.save_dir, exist_ok=True)
    utils.setup_logging()

    dllog_file = os.path.join(args.save_dir, args.dllog_file)
    utils.setup_dllogger(enabled=True, filename=dllog_file)

    if args.profile:
        try:
            pyprof.init(enable_function_stack=True)
        except NameError:
            warnings.warn('Called pyprof.init() but pyprof is not available')

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')
    dllogger.log(step='PARAMETER', data=vars(args))

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    if args.model:
        checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

        # build GNMT model
        tokenizer = Tokenizer()
        tokenizer.set_state(checkpoint['tokenizer'])
        model_config = checkpoint['model_config']
        model_config['batch_first'] = args.batch_first
        model_config['vocab_size'] = tokenizer.vocab_size
        model = GNMT(**model_config)
        model.load_state_dict(checkpoint['state_dict'])
    elif args.synthetic:
        model = GNMT(args.synthetic_vocab, batch_first=args.batch_first)
        tokenizer = None
    else:
        raise RuntimeError(
            'Specify model either with --synthetic or with --model flag')

    # construct the dataset
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.synthetic:
        data = SyntheticDataset(args.synthetic_vocab, args.synthetic_len,
                                args.batch_size[0] * args.synthetic_batches)

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    dtype = {
        'fp32': torch.FloatTensor,
        'tf32': torch.FloatTensor,
        'fp16': torch.HalfTensor
    }

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        with torch.autograd.profiler.emit_nvtx(enabled=args.profile):
            output, stats = translator.run(
                calc_bleu=args.bleu,
                eval_path=args.output,
                summary=True,
                warmup=args.warmup,
                reference_path=args.reference,
            )

        # print translated outputs
        if not args.synthetic and (not args.output and args.rank == 0):
            logging.info('Translated output:')
            for out in output:
                print(out)

        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        elif 'fp16' in args.math and 'tf32' in args.math:
            relative = 'tf32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'tf32' in args.math:
            throughput_table.write('Inference throughput', 'tf32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput',
                                   'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'tf32' in args.math:
            latency_table.write('Inference latency', 'tf32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency',
                                'fp16',
                                relative=relative,
                                reverse_speedup=True)

    avg_throughput = np.array(stats['throughputs']).mean()
    avg_latency = np.array(stats['runtimes']).mean()
    summary = {
        'eval_throughput': avg_throughput,
        'eval_bleu': stats['bleu'],
        'eval_avg_latency': avg_latency,
    }
    for p in args.percentiles:
        summary[f'eval_{p}%_latency'] = np.percentile(stats['runtimes'], p)

    dllogger.log(step=tuple(), data=summary)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
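
The summary block in this example reduces stats['runtimes'] to an average and
a set of percentile latencies before logging. The same reduction in
isolation, on hypothetical runtimes:

import numpy as np

runtimes = [0.81, 0.84, 0.79, 0.95, 0.88]  # hypothetical per-batch runtimes
percentiles = [50, 90, 99]                 # stand-in for args.percentiles

summary = {'eval_avg_latency': float(np.mean(runtimes))}
for p in percentiles:
    summary[f'eval_{p}%_latency'] = float(np.percentile(runtimes, p))
print(summary)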
Example #4
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)
    setup_logging()
    logging.info(f'Run arguments: {args}')

    if args.cuda:
        torch.cuda.set_device(args.rank)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    # construct the dataset
    test_data = TextDataset(src_fname=args.input,
                            tokenizer=tokenizer,
                            sort=False)

    # build the data loader
    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=args.batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=0,
                                       drop_last=False)

    # build the translator object
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir)

    # execute the inference
    translator.run(calc_bleu=args.bleu,
                   eval_path=args.output,
                   reference_path=args.reference,
                   summary=True)
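
Every example deserializes the checkpoint with map_location so that
torch.load places the saved tensors on the CPU instead of re-materializing
them on 'cuda:0'; the model is moved to the GPU only after load_state_dict.
A self-contained illustration (the file name is hypothetical):

import torch

# save a tiny checkpoint, then load it with all 'cuda:0' storages remapped
# to CPU, so deserialization does not allocate GPU memory
torch.save({'state_dict': {'w': torch.ones(2, 2)}}, 'tiny_ckpt.pth')
checkpoint = torch.load('tiny_ckpt.pth', map_location={'cuda:0': 'cpu'})
print(checkpoint['state_dict']['w'].device)  # cpu

# map_location='cpu' would remap storages from any device, not just cuda:0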
Example #5
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.
    """
    args = parse_args()
    utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model = GNMT(vocab_size=vocab_size, **model_config)
    model.load_state_dict(checkpoint['state_dict'])

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')
        if math == 'fp32':
            dtype = torch.FloatTensor
        elif math == 'fp16':
            dtype = torch.HalfTensor
        model.type(dtype)

        if args.cuda:
            model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=args.sort)

        # build the data loader
        test_loader = test_data.get_loader(batch_size=batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        translator.run(calc_bleu=args.bleu,
                       eval_path=args.output,
                       reference_path=args.reference,
                       summary=True)
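
All five examples pass len_norm_const, len_norm_factor and cov_penalty_factor
to the Translator. These plausibly parameterize the length-normalized,
coverage-penalized beam score from the GNMT paper (Wu et al., 2016); a sketch
under that assumption, with hypothetical defaults and candidate values:

import math

def beam_score(log_prob, length, attn_sums, len_norm_const=5.0,
               len_norm_factor=0.6, cov_penalty_factor=0.1):
    # length penalty: lp(Y) = ((c + |Y|) / (c + 1)) ** alpha
    lp = ((len_norm_const + length) / (len_norm_const + 1.0)) ** len_norm_factor
    # coverage penalty: beta * sum_j log(min(attention mass on source j, 1))
    cp = cov_penalty_factor * sum(math.log(min(a, 1.0)) for a in attn_sums)
    return log_prob / lp + cp

# hypothetical candidate: total log-probability -4.2, 12 output tokens, and
# the attention mass each source position received during decoding
print(beam_score(-4.2, 12, [0.9, 1.1, 0.8]))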