Пример #1
0
    def __init__(self, dataset, batch_size, pad, world_size=None, rank=None):
        if world_size is None:
            world_size = get_world_size()
        if rank is None:
            rank = get_rank()

        self.world_size = world_size

        global_batch_size = batch_size * world_size

        data_len = len(dataset)
        num_samples = (data_len + global_batch_size - 1) \
            // global_batch_size * global_batch_size
        self.num_samples = num_samples

        indices = list(range(data_len))
        if pad:
            indices += [0] * (num_samples - len(indices))
        else:
            indices += [-1] * (num_samples - len(indices))
        indices = torch.tensor(indices)

        indices = indices.view(-1, batch_size)
        indices = indices[rank::world_size].contiguous()
        indices = indices.view(-1)
        indices = indices[indices != -1]
        indices = indices.tolist()
        self.indices = indices
Пример #2
0
    def __init__(self, dataset, batch_size, seeds, world_size=None, rank=None):
        """
        Constructor for the DistributedSampler.

        :param dataset: dataset
        :param batch_size: local batch size
        :param seeds: list of seeds, one seed for each training epoch
        :param world_size: number of distributed workers
        :param rank: rank of the current process
        """
        if world_size is None:
            world_size = get_world_size()
        if rank is None:
            rank = get_rank()

        self.dataset = dataset
        self.world_size = world_size
        self.rank = rank
        self.epoch = 0
        self.seeds = seeds

        self.batch_size = batch_size
        self.global_batch_size = batch_size * world_size

        self.data_len = len(self.dataset)

        self.num_samples = self.data_len // self.global_batch_size \
            * self.global_batch_size
Пример #3
0
    def __init__(self, dataset, batch_size, seeds, bucketing=True,
                 world_size=None, rank=None):
        """
        Constructor for the BucketingSampler.

        :param dataset: dataset
        :param batch_size: batch size
        :param bucketing: if True enables bucketing by sequence length
        :param world_size: number of processes participating in distributed
            training
        :param rank: rank of the current process within world_size
        """
        if world_size is None:
            world_size = get_world_size()
        if rank is None:
            rank = get_rank()

        self.dataset = dataset
        self.world_size = world_size
        self.rank = rank
        self.epoch = 0
        self.bucketing = bucketing
        self.seeds = seeds

        self.batch_size = batch_size
        self.global_batch_size = batch_size * world_size

        self.data_len = len(self.dataset)
        self.num_samples = self.data_len // self.global_batch_size \
            * self.global_batch_size
Пример #4
0
    def run(self, calc_bleu=True, epoch=None, iteration=None, eval_path=None,
            summary=False, reference_path=None):
        """
        Runs translation on test dataset.

        :param calc_bleu: if True compares results with reference and computes
            BLEU score
        :param epoch: index of the current epoch
        :param iteration: index of the current iteration
        :param eval_path: path to the file for saving results
        :param summary: if True prints summary
        :param reference_path: path to the file with reference translation
        """
        if self.cuda:
            test_bleu = torch.cuda.FloatTensor([0])
            break_training = torch.cuda.LongTensor([0])
        else:
            test_bleu = torch.FloatTensor([0])
            break_training = torch.LongTensor([0])

        if eval_path is None:
            eval_path = self.build_eval_path(epoch, iteration)
        detok_eval_path = eval_path + '.detok'

        with contextlib.suppress(FileNotFoundError):
            os.remove(eval_path)
            os.remove(detok_eval_path)

        rank = get_rank()
        logging.info(f'Running evaluation on test set')
        self.model.eval()
        torch.cuda.empty_cache()

        output = self.evaluate(epoch, iteration, summary)
        output = output[:len(self.loader.dataset)]

        if rank == 0:
            with open(eval_path, 'a') as eval_file:
                eval_file.writelines(output)
            if calc_bleu:
                self.run_detokenizer(eval_path)
                test_bleu[0] = self.run_sacrebleu(detok_eval_path, reference_path)
                if summary:
                    logging.info(f'BLEU on test dataset: {test_bleu[0]:.2f}')

                if self.target_bleu and test_bleu[0] >= self.target_bleu:
                    logging.info(f'Target accuracy reached')
                    break_training[0] = 1

        barrier()
        torch.cuda.empty_cache()
        logging.info(f'Finished evaluation on test set')

        if self.distributed:
            dist.broadcast(break_training, 0)
            dist.broadcast(test_bleu, 0)

        return test_bleu[0].item(), break_training[0].item()
    def run(self,
            calc_bleu=True,
            epoch=None,
            iteration=None,
            eval_path=None,
            summary=False,
            warmup=0,
            reference_path=None):
        """
        Runs translation on test dataset.

        :param calc_bleu: if True compares results with reference and computes
            BLEU score
        :param epoch: index of the current epoch
        :param iteration: index of the current iteration
        :param eval_path: path to the file for saving results
        :param summary: if True prints summary
        :param reference_path: path to the file with reference translation
        """
        if reference_path is None:
            reference_path = self.reference

        device = next(self.model.parameters()).device

        test_bleu = torch.tensor([0.], device=device)

        rank = utils.get_rank()
        logging.info(f'Running evaluation on test set')
        self.model.eval()

        output, eval_stats = self.evaluate(self.loader, epoch, iteration,
                                           warmup, summary)
        output = output[:len(self.loader.dataset)]
        output = self.loader.dataset.unsort(output)

        if rank == 0 and eval_path:
            with open(eval_path, 'w') as eval_file:
                lines = [line + '\n' for line in output]
                eval_file.writelines(lines)
            if calc_bleu:
                test_bleu[0] = run_sacrebleu(eval_path, reference_path)
                if summary:
                    logging.info(f'BLEU on test dataset: {test_bleu[0]:.2f}')

        utils.barrier()
        logging.info(f'Finished evaluation on test set')

        if self.distributed:
            dist.broadcast(test_bleu, 0)

        if calc_bleu:
            eval_stats['bleu'] = test_bleu[0].item()
        else:
            eval_stats['bleu'] = None

        return output, eval_stats
Пример #6
0
    def __init__(self,
                 dataset,
                 batch_size,
                 pad,
                 repeat=1,
                 world_size=None,
                 rank=None):
        """
        Constructor for the StaticDistributedSampler.

        :param dataset: dataset
        :param batch_size: local batch size
        :param pad: if True: pads dataset to a multiple of global_batch_size
            samples
        :param world_size: number of distributed workers
        :param rank: rank of the current process
        """
        if world_size is None:
            world_size = get_world_size()
        if rank is None:
            rank = get_rank()

        self.world_size = world_size

        global_batch_size = batch_size * world_size

        data_len = len(dataset)
        repeated_data_len = int(len(dataset) * repeat)
        num_samples = (repeated_data_len + global_batch_size - 1) \
            // global_batch_size * global_batch_size
        self.num_samples = num_samples

        indices = list(range(repeated_data_len))
        if pad:
            # pad dataset to a multiple of global_batch_size samples, uses
            # sample with idx 0 as pad
            indices += [0] * (num_samples - len(indices))
        else:
            # temporary pad to a multiple of global batch size, pads with "-1"
            # which is later removed from the list of indices
            indices += [-1] * (num_samples - len(indices))
        indices = torch.tensor(indices)

        indices = indices.view(-1, batch_size)
        indices = indices[rank::world_size].contiguous()
        indices = indices.view(-1)
        # remove temporary pad
        indices = indices[indices != -1]
        indices = indices % data_len
        indices = indices.tolist()
        self.indices = indices
Пример #7
0
    def __init__(self, dataset, batch_size, bucket=True, world_size=None, rank=None):
        if world_size is None:
            world_size = get_world_size()
        if rank is None:
            rank = get_rank()

        self.dataset = dataset
        self.world_size = world_size
        self.rank = rank
        self.epoch = 0
        self.bucket = bucket

        self.batch_size = batch_size
        self.global_batch_size = batch_size * world_size

        self.data_len = len(self.dataset)
        self.num_samples = self.data_len // self.global_batch_size \
            * self.global_batch_size
Пример #8
0
def main():
    args = parse_args()

    use_cuda = True
    device = utils.set_device(use_cuda, args.local_rank)
    distributed = utils.init_distributed(use_cuda)
    rank = utils.get_rank()
    world_size = utils.get_world_size()

    utils.setup_logging()
    logging.info(f'Run arguments: {args}')

    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    # Pre-process dataset only on master node
    if rank == 0:
        srcs, tgts, src_lengths, tgt_lengths = load_dataset(tokenizer, args)
    else:
        srcs, tgts, src_lengths, tgt_lengths = None, None, None, None
        time.sleep(30)

    # Broadcast preprocessed dataset to other ranks
    if world_size > 1:
        srcs, tgts, src_lengths, tgt_lengths = broadcast_dataset(
            world_size, rank, args.max_length_train, srcs, tgts, src_lengths,
            tgt_lengths)

    preproc_train_data = PreprocessedDataset(
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        vocab_size=tokenizer.vocab_size,
    )
    os.makedirs(args.preproc_data_dir, exist_ok=True)
    preproc_train_data.write_data(
        os.path.join(args.preproc_data_dir, 'training.bin'),
        (srcs, src_lengths),
        (tgts, tgt_lengths),
    )
Пример #9
0
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.
    """
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    utils.setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model_config['vocab_size'] = tokenizer.vocab_size
    model = GNMT(**model_config)
    model.load_state_dict(checkpoint['state_dict'])

    # construct the dataset
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor}

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        output, stats = translator.run(
            calc_bleu=args.bleu,
            eval_path=args.output,
            summary=True,
            warmup=args.warmup,
            reference_path=args.reference,
        )

        # print translated outputs
        if not args.output and args.rank == 0:
            logging.info(f'Translated output:')
            for out in output:
                print(out)

        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput',
                                   'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency',
                                'fp16',
                                relative=relative,
                                reverse_speedup=True)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
Пример #10
0
def main():
    """
    Launches data-parallel multi-gpu training.
    """
    training_start = time.time()
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    os.makedirs(args.save_dir, exist_ok=True)

    # setup logging
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(args.log_all_ranks,
                        os.path.join(args.save_dir, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {args.save_dir}')
    logging.info(f'Run arguments: {args}')

    args.train_iter_size = set_iter_size(args.train_iter_size,
                                         args.train_global_batch_size,
                                         args.train_batch_size)

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(args.vocab, args.bpe_codes, args.lang, pad_vocab)

    # build datasets
    train_data = LazyParallelDataset(
        src_fname=args.train_src,
        tgt_fname=args.train_tgt,
        tokenizer=tokenizer,
        min_len=args.train_min_length,
        max_len=args.train_max_length,
        sort=False,
        max_size=args.train_max_size,
    )

    val_data = ParallelDataset(
        src_fname=args.val_src,
        tgt_fname=args.val_tgt,
        tokenizer=tokenizer,
        min_len=args.val_min_length,
        max_len=args.val_max_length,
        sort=True,
    )

    test_data = TextDataset(
        src_fname=args.test_src,
        tokenizer=tokenizer,
        min_len=args.test_min_length,
        max_len=args.test_max_length,
        sort=True,
    )

    vocab_size = tokenizer.vocab_size

    # build GNMT model
    model_config = {
        'hidden_size': args.hidden_size,
        'vocab_size': vocab_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding,
    }
    model = GNMT(**model_config).to(device)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD,
                                args.smoothing).to(device)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor
    }

    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets
    }
    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    translator = Translator(
        model=model,
        tokenizer=tokenizer,
        loader=test_loader,
        beam_size=args.beam_size,
        max_seq_len=args.test_max_length,
        len_norm_factor=args.len_norm_factor,
        len_norm_const=args.len_norm_const,
        cov_penalty_factor=args.cov_penalty_factor,
        print_freq=args.print_freq,
        reference=args.test_tgt,
    )

    # create trainer
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs
    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state()
    }
    loss_scaling = {
        'init_scale': args.init_scale,
        'upscale_interval': args.upscale_interval
    }
    trainer_options = dict(
        model=model,
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_dir=args.save_dir,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        train_iterations=total_train_iters,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        prealloc_mode=args.prealloc_mode,
    )

    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    training_perf = []
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)
        training_perf.append(train_perf)

        # evaluate on validation set
        if args.eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

        if args.eval:
            utils.barrier()
            eval_fname = f'eval_epoch_{epoch}'
            eval_path = os.path.join(args.save_dir, eval_fname)
            _, eval_stats = translator.run(
                calc_bleu=True,
                epoch=epoch,
                eval_path=eval_path,
            )
            test_bleu = eval_stats['bleu']
            if args.target_bleu and test_bleu >= args.target_bleu:
                logging.info(f'Target accuracy reached')
                break_training = True

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']
        if args.eval:
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    utils.barrier()
    training_stop = time.time()
    training_time = training_stop - training_start
    logging.info(f'Total training time {training_time:.0f} s')

    table = TrainingTable()
    avg_training_perf = sum(training_perf) / len(training_perf)
    table.add(utils.get_world_size(), args.train_batch_size, test_bleu,
              avg_training_perf, training_time)
    if utils.get_rank() == 0:
        table.write('Training Summary', args.math)

    passed = utils.benchmark(test_bleu, args.target_bleu, train_perf,
                             args.target_perf)
    if not passed:
        sys.exit(1)
Пример #11
0
def main():
    """
    Launches data-parallel multi-gpu training.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()

    if args.cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # initialize distributed backend
    distributed = False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1

    if distributed:
        assert args.cuda
        '''Initialize distributed communication'''
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        assert torch.distributed.is_initialized()

    gnmt_print(key=mlperf_log.RUN_START)

    args.rank = get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # setup L2 promotion
    if args.cuda:
        l2_promote()

    gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
    # https://github.com/mlperf/policies/issues/120#issuecomment-431111348
    if args.seed is None:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)
        if get_rank() == 0:
            # master seed is reported only from rank=0 worker, it's to avoid
            # confusion, seeds from rank=0 are later broadcasted to other
            # workers
            logging.info(f'Using random master seed: {master_seed}')
    else:
        # master seed was specified from command line
        master_seed = args.seed
        logging.info(f'Using master seed from command line: {master_seed}')

    # initialize seeding RNG
    seeding_rng = random.Random(master_seed)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(seeding_rng, get_world_size())

    # generate seeds for data shuffling, one seed for every epoch
    shuffling_seeds = generate_seeds(seeding_rng, args.epochs)

    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device)
    shuffling_seeds = broadcast_seeds(shuffling_seeds, device)

    # set worker seed
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train)

    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data))

    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                      config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir,
                                                      config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir,
                                                   config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=False)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=len(test_data))

    vocab_size = tokenizer.vocab_size
    # size of the vocabulary has been padded to a multiple of 8
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size)

    # build GNMT model
    model_config = dict(vocab_size=vocab_size,
                        math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    scheduler_config = literal_eval(args.scheduler_config)
    logging.info(f'Training optimizer: {opt_config}')
    logging.info(f'Training LR Schedule: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucketing=args.bucketing,
                                         num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.batch_size * get_world_size())
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE, value=len(test_loader.dataset))

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={
            'config': args,
            'tokenizer': tokenizer.get_state()
        },
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        arch=args.arch)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    # best_loss = float('inf')
    gnmt_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(1):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        logging.info(f'Finished epoch {epoch}')

    # Save the checkpoint at the end of the training loop, after the RUN_STOP
    # tag
    # https://github.com/mlperf/policies/issues/55#issuecomment-428335773
    if not args.disable_eval:
        gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT)
        if get_rank() == 0:
            trainer.save(save_all=args.save_all, is_best=True)

    gnmt_print(key=mlperf_log.RUN_FINAL)
Пример #12
0
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.
    """
    args = parse_args()
    if args.affinity != 'disabled':
        nproc_per_node = torch.cuda.device_count()
        affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node,
                                             args.affinity)
        print(f'{args.local_rank}: thread affinity: {affinity}')
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    os.makedirs(args.save_dir, exist_ok=True)
    utils.setup_logging()

    dllog_file = os.path.join(args.save_dir, args.dllog_file)
    utils.setup_dllogger(enabled=True, filename=dllog_file)

    if args.profile:
        try:
            pyprof.init(enable_function_stack=True)
        except NameError:
            warnings.warn('Called pyprof.init() but pyprof is not available')

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')
    dllogger.log(step='PARAMETER', data=vars(args))

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    if args.model:
        checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

        # build GNMT model
        tokenizer = Tokenizer()
        tokenizer.set_state(checkpoint['tokenizer'])
        model_config = checkpoint['model_config']
        model_config['batch_first'] = args.batch_first
        model_config['vocab_size'] = tokenizer.vocab_size
        model = GNMT(**model_config)
        model.load_state_dict(checkpoint['state_dict'])
    elif args.synthetic:
        model = GNMT(args.synthetic_vocab, batch_first=args.batch_first)
        tokenizer = None
    else:
        raise RuntimeError(
            'Specify model either with --synthetic or with --model flag')

    # construct the dataset
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.synthetic:
        data = SyntheticDataset(args.synthetic_vocab, args.synthetic_len,
                                args.batch_size[0] * args.synthetic_batches)

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    dtype = {
        'fp32': torch.FloatTensor,
        'tf32': torch.FloatTensor,
        'fp16': torch.HalfTensor
    }

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        with torch.autograd.profiler.emit_nvtx(enabled=args.profile):
            output, stats = translator.run(
                calc_bleu=args.bleu,
                eval_path=args.output,
                summary=True,
                warmup=args.warmup,
                reference_path=args.reference,
            )

        # print translated outputs
        if not args.synthetic and (not args.output and args.rank == 0):
            logging.info(f'Translated output:')
            for out in output:
                print(out)

        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        elif 'fp16' in args.math and 'tf32' in args.math:
            relative = 'tf32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'tf32' in args.math:
            throughput_table.write('Inference throughput', 'tf32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput',
                                   'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'tf32' in args.math:
            latency_table.write('Inference latency', 'tf32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency',
                                'fp16',
                                relative=relative,
                                reverse_speedup=True)

    avg_throughput = np.array(stats['throughputs']).mean()
    avg_latency = np.array(stats['runtimes']).mean()
    summary = {
        'eval_throughput': avg_throughput,
        'eval_bleu': stats['bleu'],
        'eval_avg_latency': avg_latency,
    }
    for p in args.percentiles:
        summary[f'eval_{p}%_latency'] = np.percentile(stats['runtimes'], p)

    dllogger.log(step=tuple(), data=summary)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
    def evaluate(self, epoch, iteration, summary):
        """
        Runs evaluation on test dataset.

        :param epoch: index of the current epoch
        :param iteration: index of the current iteration
        :param summary: if True prints summary
        """
        batch_time = AverageMeter(False)
        tot_tok_per_sec = AverageMeter(False)
        iterations = AverageMeter(False)
        enc_seq_len = AverageMeter(False)
        dec_seq_len = AverageMeter(False)
        stats = {}

        output = []

        for i, (src, indices) in enumerate(self.loader):
            translate_timer = time.time()
            src, src_length = src

            batch_size = self.loader.batch_size
            global_batch_size = batch_size * get_world_size()
            beam_size = self.beam_size

            bos = [self.insert_target_start] * (batch_size * beam_size)
            bos = torch.LongTensor(bos)
            if self.batch_first:
                bos = bos.view(-1, 1)
            else:
                bos = bos.view(1, -1)

            src_length = torch.LongTensor(src_length)
            stats['total_enc_len'] = int(src_length.sum())

            if self.cuda:
                src = src.cuda()
                src_length = src_length.cuda()
                bos = bos.cuda()

            with torch.no_grad():
                context = self.model.encode(src, src_length)
                context = [context, src_length, None]

                if beam_size == 1:
                    generator = self.generator.greedy_search
                else:
                    generator = self.generator.beam_search
                preds, lengths, counter = generator(batch_size, bos, context)

            stats['total_dec_len'] = lengths.sum().item()
            stats['iters'] = counter

            indices = torch.tensor(indices).to(preds)
            preds = preds.scatter(0,
                                  indices.unsqueeze(1).expand_as(preds), preds)

            preds = gather_predictions(preds).cpu()

            for pred in preds:
                pred = pred.tolist()
                detok = self.tokenizer.detokenize(pred)
                output.append(detok + '\n')

            elapsed = time.time() - translate_timer
            batch_time.update(elapsed, batch_size)

            total_tokens = stats['total_dec_len'] + stats['total_enc_len']
            ttps = total_tokens / elapsed
            tot_tok_per_sec.update(ttps, batch_size)

            iterations.update(stats['iters'])
            enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
            dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

            if i % self.print_freq == 0:
                log = []
                log += f'TEST '
                if epoch is not None:
                    log += f'[{epoch}]'
                if iteration is not None:
                    log += f'[{iteration}]'
                log += f'[{i}/{len(self.loader)}]\t'
                log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
                log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
                log = ''.join(log)
                logging.info(log)

        tot_tok_per_sec.reduce('sum')
        enc_seq_len.reduce('mean')
        dec_seq_len.reduce('mean')
        batch_time.reduce('mean')
        iterations.reduce('sum')

        if summary and get_rank() == 0:
            time_per_sentence = (batch_time.avg / global_batch_size)
            log = []
            log += f'TEST SUMMARY:\n'
            log += f'Lines translated: {len(self.loader.dataset)}\t'
            log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
            log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
            log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
            log += f'Avg encoder seq len: {enc_seq_len.avg:.2f}\t'
            log += f'Avg decoder seq len: {dec_seq_len.avg:.2f}\t'
            log += f'Total decoder iterations: {int(iterations.sum)}'
            log = ''.join(log)
            logging.info(log)

        return output
Пример #14
0
def main():
    """
    Launches data-parallel multi-gpu training.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)
    gnmt_print(key=mlperf_log.RUN_START, sync=True)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        assert global_bs % (bs * world_size) == 0
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info(f'Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING, sync=False)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train,
               sync=False)

    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data),
               sync=False)

    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                      config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir,
                                                      config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL, sync=False)

    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir,
                                                   config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=True)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
               value=len(test_data),
               sync=False)

    vocab_size = tokenizer.vocab_size
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size, sync=False)

    # build GNMT model
    model_config = {
        'hidden_size': args.hidden_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding
    }
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor
    }

    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets
    }
    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.train_batch_size * utils.get_world_size(),
               sync=False)
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples,
               sync=False)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE,
               value=len(test_loader.dataset),
               sync=False)

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs
    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state()
    }
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           iter_size=args.train_iter_size,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info=save_info,
                           opt_config=opt_config,
                           scheduler_config=scheduler_config,
                           train_iterations=total_train_iters,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed,
                           intra_epoch_eval=args.intra_epoch_eval,
                           translator=translator)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    break_training = False
    test_bleu = None
    gnmt_print(key=mlperf_log.TRAIN_LOOP, sync=True)
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch, sync=True)

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        # evaluate on validation set
        if args.eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint
            gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT, sync=False)
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

        if args.eval:
            gnmt_print(key=mlperf_log.EVAL_START, value=epoch, sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                       value={
                           "epoch": epoch,
                           "value": round(test_bleu, 2)
                       },
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_TARGET,
                       value=args.target_bleu,
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_STOP, sync=True)

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']
        if args.eval:
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    gnmt_print(key=mlperf_log.RUN_STOP,
               value={"success": bool(break_training)},
               sync=True)
    gnmt_print(key=mlperf_log.RUN_FINAL, sync=False)
Пример #15
0
def main():
    """
    Launches data-parallel multi-gpu training.
    """

    mlperf_compliance.mlperf_log.LOGGER.propagate = False

    mlperf_compliance.mlperf_log.setdefault(
        root_dir=os.path.dirname(os.path.abspath(__file__)),
        benchmark=mlperf_compliance.constants.GNMT,
        stack_offset=1,
        extra_print=False
        )

    mlperf_print(key=mlperf_compliance.constants.INIT_START,
                 log_all_ranks=True)

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)

    # preinit and warmup streams/ groups for apex DDP communicators
    allreduce_communicators=None
    if distributed and args.apex_num_allreduce_streams > 1:
        bucket_pgs = [torch.distributed.new_group() for _ in range(args.apex_num_allreduce_streams)]
        bucket_streams = [torch.cuda.Stream() for _ in range(args.apex_num_allreduce_streams)]
        for pg, stream in zip(bucket_pgs,bucket_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1), group=pg)
        allreduce_communicators=(bucket_pgs,bucket_streams)

    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(args.log_all_ranks,
                        os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        assert global_bs % (bs * world_size) == 0
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info(f'Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')
    # setup L2 promotion
    if args.cuda:
        utils.l2_promote()

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    # https://github.com/mlperf/policies/issues/201
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    vocab_size = tokenizer.vocab_size

    # build GNMT model
    model_config = {'hidden_size': args.hidden_size,
                    'num_layers': args.num_layers,
                    'dropout': args.dropout, 'batch_first': False,
                    'share_embedding': args.share_embedding,
                    'fusion': args.fused_attention}
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing,
                                args.fused_xentropy)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # create trainer
    save_info = {'model_config': model_config, 'config': args, 'tokenizer':
                 tokenizer.get_state()}
    loss_scaling = {'init_scale': args.init_scale, 'upscale_interval':
                    args.upscale_interval}
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        batch_first=model.batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_num_allreduce_streams=args.apex_num_allreduce_streams,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        distributed_overlap_allreduce_communicators=allreduce_communicators,
        intra_epoch_eval=args.intra_epoch_eval,
        prealloc_mode=args.prealloc_mode)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    trainer.preallocate(args.train_batch_size, args.max_length_train,
                        training=True)

    mlperf_print(key=mlperf_compliance.constants.INIT_STOP,
                 sync=True)
    mlperf_print(key=mlperf_compliance.constants.RUN_START,
                 sync=True)
    utils.barrier()

    mlperf_print(key=mlperf_compliance.constants.MAX_SEQUENCE_LENGTH,
                 value=args.max_length_train,
                 metadata={'method': 'discard'})

    if args.use_preproc_data:
        train_data = PreprocessedDataset(
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            vocab_size=tokenizer.vocab_size,
            )
        train_data.read_data(
            os.path.join(args.preproc_data_dir, 'training.bin'),
            tokenizer.vocab_size,
            )
        train_data.prepare()
    else:
        train_data = LazyParallelDataset(
            src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
            tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
            tokenizer=tokenizer,
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            sort=False,
            max_size=args.max_size,
            )

    test_data = TextDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=True)

    batching_opt = {'shard_size': args.shard_size,
                    'num_buckets': args.num_buckets}

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=model.batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=args.train_batch_size * utils.get_world_size(),
                 sync=False)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=model.batch_first,
                                       shuffle=False,
                                       num_workers=args.test_loader_workers)

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs

    scheduler_config = {'warmup_steps': args.warmup_steps,
                        'remain_steps': args.remain_steps,
                        'decay_interval': args.decay_interval,
                        'decay_steps': args.decay_steps,
                        'decay_factor': args.decay_factor}

    logging.info(f'Training LR schedule config: {scheduler_config}')
    scheduler = WarmupMultiStepLR(trainer.optimizer, total_train_iters,
                                  **scheduler_config)
    trainer.scheduler = scheduler
    trainer.translator = translator

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                     metadata={'first_epoch_num': epoch + 1,
                               'epoch_count': 1},
                     sync=True)
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        logging.info(f'Starting epoch {epoch}')
        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        if args.eval:
            mlperf_print(key=mlperf_compliance.constants.EVAL_START,
                         metadata={'epoch_num': epoch + 1},
                         sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            mlperf_print(key=mlperf_compliance.constants.EVAL_ACCURACY,
                         value=test_bleu,
                         metadata={'epoch_num': epoch + 1},
                         sync=False)
            mlperf_print(key=mlperf_compliance.constants.EVAL_STOP,
                         metadata={'epoch_num': epoch + 1},
                         sync=True)

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                     metadata={'first_epoch_num': epoch + 1},
                     sync=True)

        if break_training:
            break

    if args.use_preproc_data:
        train_data.finalize()

    status = 'success' if break_training else 'aborted'
    mlperf_print(key=mlperf_compliance.constants.RUN_STOP,
                 metadata={'status': status},
                 sync=True)