Example #1
def setup_logging(args, local_rank):
    """
    Setup logging configuration as well as random seed
    """
    logging_config(args.output_dir,
                   name='finetune_squad{}'.format(args.version),  # avoid race
                   overwrite_handler=True,
                   console=(local_rank == 0))
    logging.info(args)
    set_seed(args.seed)
    logging.debug('Random seed set to {}'.format(args.seed))
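For reference, a hypothetical call to this helper might look as follows; the argparse.Namespace fields are made up to satisfy the signature, and logging_config/set_seed are assumed to be importable from GluonNLP's utilities:

import argparse

args = argparse.Namespace(output_dir='finetune_out', version='2.0', seed=100)
setup_logging(args, local_rank=0)  # only local rank 0 logs to the console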
Example #2
def test_logging_config():
    logger = logging.getLogger(__name__)
    with tempfile.TemporaryDirectory() as root:
        logging_config(folder=root, logger=logger, name='test')
        file_names = os.listdir(root)
        assert file_names[0] == 'test.log'
        file_size = Path(os.path.join(root, 'test.log')).stat().st_size
        assert file_size == 0
        logger.info('123')
        for handler in logger.handlers:
            handler.flush()
        file_size_test1 = Path(os.path.join(root, 'test.log')).stat().st_size
        assert file_size_test1 > 0
        logging_config(folder=root, logger=logger, name='foo', overwrite_handler=False)
        logger.info('123')
        for handler in logger.handlers:
            handler.flush()
        file_size_test2 = Path(os.path.join(root, 'test.log')).stat().st_size
        file_size_foo1 = Path(os.path.join(root, 'foo.log')).stat().st_size
        assert file_size_test2 > file_size_test1
        assert file_size_foo1 > 0

        # After overwriting, the old handler will be removed
        logging_config(folder=root, logger=logger, name='zoo', overwrite_handler=True)
        logger.info('12345')
        for handler in logger.handlers:
            handler.flush()
        file_size_zoo1 = Path(os.path.join(root, 'zoo.log')).stat().st_size
        file_size_test3 = Path(os.path.join(root, 'test.log')).stat().st_size
        file_size_foo2 = Path(os.path.join(root, 'foo.log')).stat().st_size
        assert file_size_test3 == file_size_test2
        assert file_size_foo2 == file_size_foo1
        assert file_size_zoo1 > 0
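The test above pins down the contract of logging_config: it attaches a FileHandler that writes to <folder>/<name>.log, keeps any previously attached file handlers unless overwrite_handler=True, and leaves the new file empty until the first record is emitted. A minimal sketch of a compatible implementation (not the actual GluonNLP code, which also installs formatters and a console handler):

import logging
import os

def logging_config(folder, name='root', logger=None, level=logging.INFO,
                   console=False, overwrite_handler=False):
    """Minimal sketch: attach a FileHandler writing to <folder>/<name>.log."""
    logger = logger if logger is not None else logging.getLogger()
    os.makedirs(folder, exist_ok=True)
    if overwrite_handler:
        # Drop the old file handlers so only the new log file receives records.
        for handler in list(logger.handlers):
            if isinstance(handler, logging.FileHandler):
                logger.removeHandler(handler)
    logger.setLevel(level)
    logger.addHandler(logging.FileHandler(os.path.join(folder, name + '.log')))
    if console:
        logger.addHandler(logging.StreamHandler())
    return folder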
Example #3
def train(args):
    _, num_parts, rank, local_rank, _, ctx_l = init_comm(
        args.comm_backend, args.gpus)
    if args.comm_backend == 'horovod':
        logging_config(
            args.save_dir,
            name=f'train_transformer_rank{rank}_local{local_rank}_{num_parts}',
            console=(rank == 0))
        logging.info(args)
    else:
        logging_config(args.save_dir, name='train_transformer', console=True)
        logging.info(args)
    use_amp = args.fp16
    if use_amp:
        from mxnet import amp
    src_tokenizer = create_tokenizer(args.src_tokenizer,
                                     args.src_subword_model_path,
                                     args.src_vocab_path)
    tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
                                     args.tgt_subword_model_path,
                                     args.tgt_vocab_path)
    base_tgt_tokenizer = MosesTokenizer(args.tgt_lang)
    src_vocab = src_tokenizer.vocab
    tgt_vocab = tgt_tokenizer.vocab
    train_src_data, train_tgt_data = load_dataset_with_cache(
        args.train_src_corpus,
        args.train_tgt_corpus,
        src_tokenizer,
        tgt_tokenizer,
        args.overwrite_cache,
        local_rank,
        max_src_length=args.max_src_length,
        max_tgt_length=args.max_tgt_length,
        pretokenized=not args.tokenize)
    dev_src_data, dev_tgt_data = load_dataset_with_cache(
        args.dev_src_corpus,
        args.dev_tgt_corpus,
        src_tokenizer,
        tgt_tokenizer,
        args.overwrite_cache,
        local_rank,
        pretokenized=not args.tokenize)
    tgt_detok_sentences = []
    tgt_raw_sentences = []
    with open(args.dev_tgt_corpus, 'r') as in_f:
        for line in in_f:
            tgt_detok_sentences.append(
                base_tgt_tokenizer.decode(
                    tgt_tokenizer.decode(line.split()).split()))
    with open(args.dev_tgt_raw_corpus, 'r') as in_f:
        for line in in_f:
            tgt_raw_sentences.append(line.strip())
    data_train = gluon.data.SimpleDataset([
        (src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i)
        for i, (src_tokens,
                tgt_tokens) in enumerate(zip(train_src_data, train_tgt_data))
    ])
    val_samples = [
        (src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i)
        for i, (src_tokens,
                tgt_tokens) in enumerate(zip(dev_src_data, dev_tgt_data))
    ]
    if args.comm_backend == 'horovod':
        slice_begin = rank * (len(val_samples) // num_parts)
        slice_end = min((rank + 1) * (len(val_samples) // num_parts),
                        len(val_samples))
        data_val = gluon.data.SimpleDataset(val_samples[slice_begin:slice_end])
    else:
        data_val = gluon.data.SimpleDataset(val_samples)
    # Construct the model + loss function
    if args.cfg.endswith('.yml'):
        cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
    else:
        cfg = TransformerModel.get_cfg(args.cfg)
    cfg.defrost()
    cfg.MODEL.src_vocab_size = len(src_vocab)
    cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
    cfg.MODEL.layout = 'TN'
    cfg.freeze()
    model = TransformerModel.from_cfg(cfg)
    model.initialize(mx.init.Xavier(magnitude=args.magnitude), ctx=ctx_l)
    model.hybridize()
    for v in model.collect_params().values():
        if v.grad_req != 'null':
            v.grad_req = 'add'
    # Do not apply weight decay to all the LayerNorm and bias
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    param_dict = deduplicate_param_dict(model.collect_params())

    inference_model = TransformerInference(model=model)
    inference_model.hybridize()
    if local_rank == 0:
        logging.info(model)
    with open(os.path.join(args.save_dir, 'config.yml'), 'w') as cfg_f:
        cfg_f.write(cfg.dump())
    label_smooth_loss = LabelSmoothCrossEntropyLoss(
        num_labels=len(tgt_vocab),
        alpha=args.label_smooth_alpha,
        from_logits=False)
    label_smooth_loss.hybridize()

    # Construct the beam search sampler
    scorer = BeamSearchScorer(alpha=args.lp_alpha,
                              K=args.lp_k,
                              from_logits=False)
    beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
                                            decoder=inference_model,
                                            vocab_size=len(tgt_vocab),
                                            eos_id=tgt_vocab.eos_id,
                                            scorer=scorer,
                                            stochastic=False,
                                            max_length_a=args.max_length_a,
                                            max_length_b=args.max_length_b)

    logging.info(beam_search_sampler)
    if args.comm_backend == 'horovod':
        hvd.broadcast_parameters(param_dict, root_rank=0)

    # Construct the trainer
    if args.lr is None:
        base_lr = 2.0 / math.sqrt(args.num_units) / math.sqrt(
            args.warmup_steps)
    else:
        base_lr = args.lr
    lr_scheduler = InverseSquareRootScheduler(
        warmup_steps=args.warmup_steps,
        base_lr=base_lr,
        warmup_init_lr=args.warmup_init_lr)
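    # For example, with the Transformer-base defaults num_units=512 and
    # warmup_steps=4000, this gives base_lr = 2.0 / sqrt(512) / sqrt(4000)
    # ~= 1.4e-3.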
    optimizer_params = {
        'learning_rate': base_lr,  # not args.lr, which may be None
        'beta1': 0.9,
        'beta2': 0.997,
        'epsilon': 1e-9,
        'lr_scheduler': lr_scheduler,
        'wd': args.wd
    }
    user_provided_optimizer_params = json.loads(args.optimizer_params)
    optimizer_params.update(user_provided_optimizer_params)

    if args.fp16:
        optimizer_params.update({'multi_precision': True})
    if args.comm_backend == 'horovod':
        trainer = hvd.DistributedTrainer(param_dict, args.optimizer,
                                         optimizer_params)
    else:
        trainer = gluon.Trainer(param_dict,
                                args.optimizer,
                                optimizer_params,
                                update_on_kvstore=False)
    # Load Data
    if args.sampler == 'BoundedBudgetSampler':
        train_batch_sampler = BoundedBudgetSampler(
            lengths=[(ele[2], ele[3]) for ele in data_train],
            max_num_tokens=args.max_num_tokens,
            max_num_sentences=args.max_num_sentences,
            shuffle=True,
            seed=args.seed)
    elif args.sampler == 'FixedBucketSampler':
        if args.comm_backend == 'horovod':
            raise NotImplementedError(
                'FixedBucketSampler does not support horovod at present')

        if args.bucket_scheme == 'constant':
            bucket_scheme = ConstWidthBucket()
        elif args.bucket_scheme == 'linear':
            bucket_scheme = LinearWidthBucket()
        elif args.bucket_scheme == 'exp':
            bucket_scheme = ExpWidthBucket(bucket_len_step=1.2)
        else:
            raise NotImplementedError
        # TODO(sxjscience) Support auto-bucket-size tuning
        train_batch_sampler = FixedBucketSampler(lengths=[
            (ele[2], ele[3]) for ele in data_train
        ],
                                                 batch_size=args.batch_size,
                                                 num_buckets=args.num_buckets,
                                                 ratio=args.bucket_ratio,
                                                 shuffle=True,
                                                 use_average_length=True,
                                                 bucket_scheme=bucket_scheme,
                                                 seed=args.seed)
    else:
        raise NotImplementedError

    num_updates_per_epoch = int(
        math.ceil(
            len(train_batch_sampler) /
            (num_parts * len(ctx_l) * args.num_accumulated)))
    # Convert the batch sampler to multiple shards
    if num_parts > 1:
        train_batch_sampler = ShardedIterator(train_batch_sampler,
                                              num_parts=num_parts,
                                              part_index=rank,
                                              even_size=True,
                                              seed=args.seed + 1000 * rank)

    logging.info(train_batch_sampler)

    batchify_fn = bf.Tuple(bf.Pad(), bf.Pad(), bf.Stack(), bf.Stack(),
                           bf.Stack())
    train_data_loader = gluon.data.DataLoader(
        data_train,
        batch_sampler=train_batch_sampler,
        batchify_fn=batchify_fn,
        num_workers=0)
    val_data_loader = gluon.data.DataLoader(data_val,
                                            batch_size=args.val_batch_size,
                                            batchify_fn=batchify_fn,
                                            num_workers=0,
                                            shuffle=False)
    params = [p for p in param_dict.values() if p.grad_req != 'null']
    model_averager = AverageSGDTracker(param_dict)
    log_start_time = time.time()
    num_params, num_fixed_params = None, None

    # TODO(sxjscience) Add a log metric class
    log_avg_loss_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l]
    # Maintain the denominator of the loss.
    log_avg_loss_denom_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l]
    log_wc_l = [mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l]
    log_tgt_wc_l = [mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l]
    log_avg_grad_norm = 0
    log_iter_num = 0

    if local_rank == 0:
        writer = SummaryWriter(
            logdir=os.path.join(args.save_dir, 'tensorboard'))
    if use_amp:
        amp.init_trainer(trainer)
    train_multi_data_loader = grouper(repeat(train_data_loader), len(ctx_l))
    # when args.epochs < 0, the model will keep training
    if args.epochs < 0:
        if args.max_update > 0:
            total_train_iters = args.max_update
            if args.num_averages > 0:
                assert args.num_averages <= total_train_iters // args.save_interval_update
                avg_start_iter = (
                    total_train_iters // args.save_interval_update -
                    args.num_averages) * args.save_interval_update
            else:
                avg_start_iter = -1
        else:
            total_train_iters = np.inf
            avg_start_iter = -1
    else:
        total_train_iters = args.epochs * num_updates_per_epoch
        if args.num_averages > 0:
            assert args.num_averages <= args.epochs
            avg_start_iter = (args.epochs -
                              args.num_averages) * num_updates_per_epoch
        else:
            avg_start_iter = -1

    # Here, we are manually setting up the scale to 1.0 because
    # in horovod, the scale can be the number of workers:
    # See the code here: https://github.com/horovod/horovod/blob/125115583b7029196e2ec530decd4209459d5479/horovod/mxnet/__init__.py#L141
    # Since we will need to use the dynamic scaling in amp, we will manually call amp.unscale().
    # A scale that is larger than 1.0 can be problematic in this case.
    trainer._scale = 1.0
    if args.max_num_tokens > 0:
        const_scale = args.max_num_tokens
    else:
        const_scale = 100

    train_start_time = time.time()

    for train_iter in range(total_train_iters):
        model.zero_grad()
        loss_denom_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l]
        for i in range(args.num_accumulated):
            loss_l = []
            sample_data_l = next(train_multi_data_loader)
            for j, (sample_data, ctx) in enumerate(zip(sample_data_l, ctx_l)):
                src_token_ids, tgt_token_ids, src_valid_length,\
                tgt_valid_length, sample_ids = sample_data
                src_token_ids = src_token_ids.as_in_ctx(ctx)
                tgt_token_ids = tgt_token_ids.as_in_ctx(ctx)
                src_valid_length = src_valid_length.as_in_ctx(ctx)
                tgt_valid_length = tgt_valid_length.as_in_ctx(ctx)
                src_wc, tgt_wc, bs = src_valid_length.sum(), \
                                     tgt_valid_length.sum(), src_token_ids.shape[0]
                log_wc_l[j] += src_wc + tgt_wc
                log_tgt_wc_l[j] += tgt_wc
                token_count = (tgt_valid_length - 1).sum()
                loss_denom_l[j] += token_count / const_scale
                log_avg_loss_denom_l[j] += token_count / const_scale
                with mx.autograd.record():
                    if model.layout == 'NT':
                        tgt_pred = model(src_token_ids, src_valid_length,
                                         tgt_token_ids[:, :-1],
                                         tgt_valid_length - 1)
                        tgt_labels = tgt_token_ids[:, 1:]
                        loss = label_smooth_loss(tgt_pred, tgt_labels)
                        loss = mx.npx.sequence_mask(
                            loss,
                            sequence_length=tgt_valid_length - 1,
                            use_sequence_length=True,
                            axis=1)
                        loss = loss.sum() / const_scale
                        loss_l.append(loss)
                    elif model.layout == 'TN':
                        tgt_pred = model(src_token_ids.T, src_valid_length,
                                         tgt_token_ids.T[:-1, :],
                                         tgt_valid_length - 1)
                        tgt_labels = tgt_token_ids.T[1:, :]
                        loss = label_smooth_loss(tgt_pred, tgt_labels)
                        loss = mx.npx.sequence_mask(
                            loss,
                            sequence_length=tgt_valid_length - 1,
                            use_sequence_length=True,
                            axis=0)
                        loss = loss.sum() / const_scale
                        loss_l.append(loss)
                log_avg_loss_l[j] += loss
            if use_amp:
                with mx.autograd.record():
                    with amp.scale_loss(loss_l, trainer) as amp_loss_l:
                        for loss in amp_loss_l:
                            loss.backward()
            else:
                with mx.autograd.record():
                    for loss in loss_l:
                        loss.backward()

        # Print the total number of parameters
        if local_rank == 0 and num_params is None:
            num_params, num_fixed_params = count_parameters(param_dict)
            logging.info(
                'Total Number of Parameters (not-fixed/fixed): {}/{}'.format(
                    num_params, num_fixed_params))
        # All-Reduce the gradient
        trainer.allreduce_grads()
        if args.comm_backend == 'horovod':
            # All-Reduce the loss denominator
            assert len(loss_denom_l) == 1
            loss_denom = hvd.allreduce(loss_denom_l[0],
                                       average=False).asnumpy()
        else:
            loss_denom = sum([ele.asnumpy() for ele in loss_denom_l])
        if use_amp:
            # We need to first unscale the gradient and then perform allreduce.
            grad_scale = trainer.amp_loss_scale * loss_denom
        else:
            grad_scale = loss_denom
        if args.max_grad_norm is not None:
            total_norm, ratio, is_finite\
                = clip_grad_global_norm(params, args.max_grad_norm * grad_scale)
            total_norm = total_norm / grad_scale
        else:
            total_norm = grad_global_norm(params)
            total_norm = total_norm / grad_scale
        log_avg_grad_norm += total_norm
        log_iter_num += 1

        trainer.update(loss_denom, ignore_stale_grad=True)

        if avg_start_iter > 0 and train_iter >= avg_start_iter:
            model_averager.step()

        if ((train_iter + 1) % args.log_interval == 0
                or train_iter + 1 == total_train_iters):
            if args.comm_backend == 'horovod':
                # Use allreduce to get the total number of tokens and loss
                log_wc = hvd.allreduce(log_wc_l[0], average=False).asnumpy()
                log_tgt_wc = hvd.allreduce(log_tgt_wc_l[0],
                                           average=False).asnumpy()
                log_avg_loss = hvd.allreduce(log_avg_loss_l[0] /
                                             log_avg_loss_denom_l[0],
                                             average=True)
                log_avg_loss = log_avg_loss.asnumpy()
            else:
                log_wc = sum([ele.asnumpy() for ele in log_wc_l])
                log_tgt_wc = sum([ele.asnumpy() for ele in log_tgt_wc_l])
                log_avg_loss =\
                    sum([log_avg_loss_l[i].asnumpy() / log_avg_loss_denom_l[i].asnumpy()
                         for i in range(len(log_avg_loss_l))]) / len(log_avg_loss_l)
            log_avg_grad_norm = log_avg_grad_norm / log_iter_num
            log_end_time = time.time()
            wps = log_wc / (log_end_time - log_start_time)
            epoch_id = train_iter // num_updates_per_epoch
            logging.info(
                '[Epoch {} Iter {}/{}, Overall {}/{}] loss={:.4f}, ppl={:.4f}, '
                'throughput={:.2f}K wps, total wc={:.2f}K, wpb={:.2f}K,'
                ' LR={}, gnorm={:.4f}, ETA={:.2f}h'.format(
                    epoch_id, train_iter % num_updates_per_epoch + 1,
                    num_updates_per_epoch,
                    train_iter + 1, total_train_iters, log_avg_loss,
                    np.exp(log_avg_loss), wps / 1000, log_wc / 1000,
                    log_tgt_wc / 1000 / log_iter_num, trainer.learning_rate,
                    log_avg_grad_norm,
                    (log_end_time - train_start_time) / (train_iter + 1) *
                    (total_train_iters - train_iter - 1) / 3600))
            if local_rank == 0:
                writer.add_scalar('throughput_wps', wps, train_iter)
                writer.add_scalar('train_loss', log_avg_loss, train_iter)
                writer.add_scalar('lr', trainer.learning_rate, train_iter)
                writer.add_scalar('grad_norm', log_avg_grad_norm, train_iter)
            # Reinitialize the log variables
            log_start_time = time.time()
            log_avg_loss_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l]
            log_avg_loss_denom_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l]
            log_avg_grad_norm = 0
            log_iter_num = 0
            log_wc_l = [
                mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l
            ]
            log_tgt_wc_l = [
                mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l
            ]

        if (args.max_update > 0 and (train_iter + 1) % args.save_interval_update == 0) \
            or ((train_iter + 1) % num_updates_per_epoch == 0) \
            or train_iter + 1 == total_train_iters:
            epoch_id = (train_iter + 1) // num_updates_per_epoch
            if local_rank == 0:
                if args.max_update <= 0:
                    model.save_parameters(os.path.join(
                        args.save_dir, 'epoch{}.params'.format(epoch_id)),
                                          deduplicate=True)
                else:
                    model.save_parameters(os.path.join(
                        args.save_dir, 'iter{}.params'.format(train_iter + 1)),
                                          deduplicate=True)

            avg_val_loss, ntokens, pred_sentences, pred_lengths, sentence_ids\
                = validation(model, val_data_loader, inference_model, beam_search_sampler,
                             tgt_tokenizer, ctx_l)
            if args.comm_backend == 'horovod':
                flatten_pred_sentences = np.concatenate(pred_sentences, axis=0)
                all_val_loss = hvd.allgather(
                    mx.np.array([avg_val_loss * ntokens],
                                dtype=np.float32,
                                ctx=ctx_l[0]))
                all_ntokens = hvd.allgather(
                    mx.np.array([ntokens], dtype=np.int64, ctx=ctx_l[0]))
                flatten_pred_sentences = hvd.allgather(
                    mx.np.array(flatten_pred_sentences,
                                dtype=np.int32,
                                ctx=ctx_l[0]))
                pred_lengths = hvd.allgather(
                    mx.np.array(pred_lengths, dtype=np.int64, ctx=ctx_l[0]))
                sentence_ids = hvd.allgather(
                    mx.np.array(sentence_ids, dtype=np.int64, ctx=ctx_l[0]))
                avg_val_loss = (all_val_loss.asnumpy().sum()
                                / all_ntokens.asnumpy().sum())
                flatten_pred_sentences = flatten_pred_sentences.asnumpy()
                pred_lengths = pred_lengths.asnumpy()
                sentence_ids = sentence_ids.asnumpy()
                pred_sentences = [None for _ in range(len(sentence_ids))]
                ptr = 0
                assert (sentence_ids.min() == 0
                        and sentence_ids.max() == len(sentence_ids) - 1)
                for sentence_id, length in zip(sentence_ids, pred_lengths):
                    pred_sentences[sentence_id] = flatten_pred_sentences[ptr:(
                        ptr + length)]
                    ptr += length
            if local_rank == 0:
                # Perform detokenization
                pred_sentences_bpe_decode = []
                pred_sentences_raw = []
                for sentence in pred_sentences:
                    bpe_decode_sentence = tgt_tokenizer.decode(
                        sentence.tolist())
                    raw_sentence = base_tgt_tokenizer.decode(
                        bpe_decode_sentence.split())
                    pred_sentences_bpe_decode.append(bpe_decode_sentence)
                    pred_sentences_raw.append(raw_sentence)
                detok_sacrebleu_out = sacrebleu.corpus_bleu(
                    sys_stream=pred_sentences_bpe_decode,
                    ref_streams=[tgt_detok_sentences])
                raw_sacrebleu_out = sacrebleu.corpus_bleu(
                    sys_stream=pred_sentences_raw,
                    ref_streams=[tgt_raw_sentences])
                with open(
                        os.path.join(args.save_dir,
                                     f'epoch{epoch_id}_dev_prediction.txt'),
                        'w') as of:
                    for line in pred_sentences_raw:
                        of.write(line + '\n')
                logging.info(
                    '[Epoch {}][Iter {}/{}] validation loss/ppl={:.4f}/{:.4f}, '
                    'SacreBLEU={}, Detok SacreBLEU={}'.format(
                        epoch_id, train_iter, total_train_iters, avg_val_loss,
                        np.exp(avg_val_loss), raw_sacrebleu_out.score,
                        detok_sacrebleu_out.score))
                writer.add_scalar('valid_loss', avg_val_loss, train_iter)
                writer.add_scalar('valid_bleu', raw_sacrebleu_out.score,
                                  train_iter)

    if args.num_averages > 0:
        model_averager.copy_back(
            param_dict)  # TODO(sxjscience) Rewrite using update
        model.save_parameters(os.path.join(args.save_dir, 'average.params'),
                              deduplicate=True)
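A note on the normalization above: each micro-batch loss is divided by the constant const_scale, and trainer.update() is later called with loss_denom, the accumulated token count divided by the same constant. The constant therefore cancels and the update is an exact per-token average, no matter how the tokens are spread across the accumulated micro-batches. A toy check of that identity (plain Python, made-up numbers):

const_scale = 100.0
micro_batch_losses = [4.0, 6.0, 10.0]  # summed token losses per micro-batch
micro_batch_tokens = [2, 3, 5]         # target-token counts per micro-batch

# What .backward() accumulates (up to the gradient operator):
scaled_loss = sum(l / const_scale for l in micro_batch_losses)
# What trainer.update(loss_denom) divides the accumulated gradient by:
loss_denom = sum(t / const_scale for t in micro_batch_tokens)
per_token_loss = scaled_loss / loss_denom
assert abs(per_token_loss
           - sum(micro_batch_losses) / sum(micro_batch_tokens)) < 1e-12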
Example #4
    with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
        of.write(gluon_cfg.dump())

    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()

    gluon_xlmr = convert_params(fairseq_xlmr, gluon_cfg, ctx)
    if args.test:
        test_model(fairseq_xlmr, gluon_xlmr, args.gpu)

    gluon_xlmr.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'),
                               deduplicate=True)
    logging.info('Converted the RoBERTa MLM model in {} to {}'.
                 format(os.path.join(args.fairseq_model_path, 'model.pt'),
                        os.path.join(args.save_dir, 'model_mlm.params')))
    gluon_xlmr.backbone_model.save_parameters(os.path.join(
        args.save_dir, 'model.params'),
                                              deduplicate=True)
    logging.info('Converted the RoBERTa backbone model in {} to {}'.
                 format(os.path.join(args.fairseq_model_path, 'model.pt'),
                        os.path.join(args.save_dir, 'model.params')))

    logging.info('Conversion finished!')
    logging.info('Statistics:')
    rename(args.save_dir)


if __name__ == '__main__':
    args = parse_args()
    logging_config()
    convert_fairseq_model(args)
Example #5
def is_tf_available():
    return tensorflow is not None


def is_mxnet_available():
    return mxnet is not None


if platform.system() == "Windows":
    from signal import CTRL_C_EVENT as SIGKILL
else:
    from signal import SIGKILL
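# Note: Windows has no SIGKILL, so CTRL_C_EVENT is imported under that name
# as the closest available way to stop a benchmark subprocess.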


logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logging_config(folder='gluonnlp_benchmark', name='benchmark', logger=logger)


_is_memory_tracing_enabled = False

BenchmarkOutput = namedtuple(
    "BenchmarkOutput",
    [
        "inference_result",
        "train_result",
    ],
)


def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
    """
Example #6
def is_tf_available():
    return tensorflow is not None


def is_mxnet_available():
    return mxnet is not None


if platform.system() == "Windows":
    from signal import CTRL_C_EVENT as SIGKILL
else:
    from signal import SIGKILL

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logging_config(logger=logger)

_is_memory_tracing_enabled = False

BenchmarkOutput = namedtuple(
    "BenchmarkOutput",
    [
        "inference_result",
        "train_result",
    ],
)


def separate_process_wrapper_fn(
        func: Callable[[],
                       None], do_multi_processing: bool) -> Callable[[], None]:
Example #7
def parse_args():
    parser = argparse.ArgumentParser(
        description='Transformer for Neural Machine Translation.')
    parser.add_argument('--train_src_corpus',
                        type=str,
                        help='The source training corpus.')
    parser.add_argument('--train_tgt_corpus',
                        type=str,
                        help='The target training corpus.')
    parser.add_argument('--dev_src_corpus',
                        type=str,
                        help='The source dev corpus.')
    parser.add_argument('--dev_tgt_corpus',
                        type=str,
                        help='The target dev corpus.')
    parser.add_argument(
        '--src_tokenizer',
        choices=[
            'spm', 'subword_nmt', 'yttm', 'hf_bytebpe', 'hf_wordpiece',
            'hf_bpe', 'whitespace'
        ],
        default='whitespace',
        type=str,
        help='The source tokenizer. '
        'Whitespace tokenizer supports processing pre-encoded corpus, '
        'and the tokenizers besides whitespace support online encoding.')
    parser.add_argument('--tgt_tokenizer',
                        choices=[
                            'spm', 'subword_nmt', 'yttm', 'hf_bytebpe',
                            'hf_wordpiece', 'hf_bpe', 'whitespace'
                        ],
                        default='whitespace',
                        type=str,
                        help='The target tokenizer.')
    parser.add_argument('--src_subword_model_path',
                        type=str,
                        help='Path to the source subword model.')
    parser.add_argument('--src_vocab_path',
                        type=str,
                        help='Path to the source vocab.')
    parser.add_argument('--tgt_subword_model_path',
                        type=str,
                        help='Path to the target subword model.')
    parser.add_argument('--tgt_vocab_path',
                        type=str,
                        help='Path to the target vocab.')
    parser.add_argument('--seed',
                        type=int,
                        default=100,
                        help='The random seed.')
    parser.add_argument(
        '--epochs',
        type=int,
        default=30,
        help='Upper epoch limit. '
        'The model will keep training when epochs < 0 and max_update < 0.')
    parser.add_argument(
        '--max_update',
        type=int,
        default=-1,
        help='Max update steps. When max_update > 0, epochs will be set to -1. '
        'Each update step consumes gpu_num * num_accumulated batches.')
    parser.add_argument(
        '--save_interval_update',
        type=int,
        default=500,
        help='Update interval of saving checkpoints while using max_update.')
    parser.add_argument(
        '--cfg',
        type=str,
        default='transformer_base',
        help='Configuration of the transformer model. '
        'You may select a yml file or use the prebuilt configurations.')
    parser.add_argument('--label_smooth_alpha',
                        type=float,
                        default=0.1,
                        help='Weight of label smoothing')
    parser.add_argument('--sampler',
                        type=str,
                        choices=['BoundedBudgetSampler', 'FixedBucketSampler'],
                        default='FixedBucketSampler',
                        help='Type of sampler')
    parser.add_argument(
        '--batch_size',
        type=int,
        default=2700,
        help='Batch size. Number of tokens per gpu in a minibatch.')
    parser.add_argument('--val_batch_size',
                        type=int,
                        default=16,
                        help='Batch size for evaluation.')
    parser.add_argument('--num_buckets',
                        type=int,
                        default=20,
                        help='Bucket number.')
    parser.add_argument(
        '--bucket_scheme',
        type=str,
        default='exp',
        help='Strategy for generating bucket keys. It supports: '
        '"constant": all the buckets have the same width; '
        '"linear": the width of bucket increases linearly; '
        '"exp": the width of bucket increases exponentially')
    parser.add_argument(
        '--bucket_ratio',
        type=float,
        default=0.0,
        help='Ratio for increasing the throughput of the bucketing')
    parser.add_argument(
        '--max_num_tokens',
        type=int,
        default=-1,
        help='Maximum number of tokens in each batch, '
        'applicable when using BoundedBudgetSampler.')
    parser.add_argument(
        '--max_num_sentences',
        type=int,
        default=-1,
        help='Maximum number of sentences in each batch, '
        'applicable when using BoundedBudgetSampler.')
    parser.add_argument(
        '--lr',
        type=float,
        default=0.002,
        help='The learning rate at the end of the warmup stage. '
        'If it is not given, we will use '
        '2.0 / sqrt(d_model) / sqrt(warmup_steps), i.e., twice the value '
        'suggested in the original Transformer paper. '
        'Otherwise, we will use the given lr as the final learning rate of '
        'the warmup phase.')
    parser.add_argument(
        '--warmup_steps',
        type=int,
        default=4000,
        help='Number of warmup steps used in the Noam learning rate schedule')
    parser.add_argument(
        '--warmup_init_lr',
        type=float,
        default=0.0,
        help='Initial learning rate at the beginning of the warm-up stage')
    parser.add_argument(
        '--num_accumulated',
        type=int,
        default=32,
        help='Number of steps to accumulate the gradients. '
        'This is useful to mimic large batch training with limited gpu memory')
    parser.add_argument('--magnitude',
                        type=float,
                        default=3.0,
                        help='Magnitude of Xavier initialization')
    parser.add_argument('--num_averages',
                        type=int,
                        default=-1,
                        help='Perform final testing based on the '
                        'average of last num_averages checkpoints. '
                        'Using num_averages will cause extra GPU memory usage.')
    parser.add_argument('--log_interval',
                        type=int,
                        default=10,
                        metavar='N',
                        help='report interval')
    parser.add_argument(
        '--save_dir',
        type=str,
        default='transformer_out',
        help='directory path to save the final model and training log')
    parser.add_argument('--overwrite_cache', action='store_true')
    parser.add_argument('--fp16',
                        action='store_true',
                        help='Whether to use dtype float16')
    parser.add_argument('--comm_backend',
                        type=str,
                        default='device',
                        choices=['horovod', 'dist_sync_device', 'device'],
                        help='Communication backend.')
    parser.add_argument(
        '--gpus',
        type=str,
        help='List of GPUs to run on, e.g. 0 or 0,2,5. Empty means using CPU.')
    args = parser.parse_args()
    if args.max_update > 0:
        args.epochs = -1
    logging_config(args.save_dir, console=True)
    logging.info(args)
    return args
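As a quick illustration of the max_update/epochs interplay encoded at the end of parse_args, the function can be driven in-process with a simulated command line; the file names below are hypothetical, and parse_args() will also create save_dir and set up logging as a side effect:

import sys

sys.argv = ['train_transformer.py',
            '--train_src_corpus', 'train.src', '--train_tgt_corpus', 'train.tgt',
            '--dev_src_corpus', 'dev.src', '--dev_tgt_corpus', 'dev.tgt',
            '--max_update', '100000']
args = parse_args()
assert args.epochs == -1  # max_update > 0 switches off epoch-based training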
Example #8
def train(args):
    _, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
        args.comm_backend, args.gpus)
    level = logging.DEBUG if args.verbose else logging.INFO
    logging_config(
        args.ckpt_dir,
        name='pretrain_bert_' + str(rank),  # avoid race
        level=level,
        console=(local_rank == 0))
    logging.info(args)
    logging.debug('Random seed set to {}'.format(args.seed))
    set_seed(args.seed)
    logging.info('Training info: num_buckets: {}, '
                 'num_workers: {}, rank: {}'.format(args.num_buckets,
                                                    num_workers, rank))
    cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l)
    if args.start_step:
        logging.info('Restart training from {}'.format(args.start_step))
        parameters_option(args.start_step, model, args.ckpt_dir, 'Loading',
                          ctx_l)
    else:
        model.initialize(ctx=ctx_l)
    model.hybridize()

    if args.raw:
        get_dataset_fn = functools.partial(
            get_pretrain_data_text,
            max_seq_length=args.max_seq_length,
            short_seq_prob=args.short_seq_prob,
            masked_lm_prob=args.masked_lm_prob,
            max_predictions_per_seq=args.max_predictions_per_seq,
            whole_word_mask=args.whole_word_mask,
            random_next_sentence=args.random_next_sentence,
            tokenizer=tokenizer,
            circle_length=args.circle_length,
            repeat=args.repeat,
            dataset_cached=args.dataset_cached,
            num_max_dataset_cached=args.num_max_dataset_cached)
    else:
        get_dataset_fn = get_pretrain_data_npz

    data_train = get_dataset_fn(args.data,
                                args.batch_size,
                                shuffle=True,
                                num_buckets=args.num_buckets,
                                vocab=tokenizer.vocab,
                                num_parts=num_workers,
                                part_idx=rank,
                                num_dataset_workers=args.num_dataset_workers,
                                num_batch_workers=args.num_batch_workers)

    param_dict = model.collect_params()
    # Do not apply weight decay to all the LayerNorm and bias
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Set grad_req if gradient accumulation is required
    params = [p for p in param_dict.values() if p.grad_req != 'null']
    num_accumulated = args.num_accumulated
    if num_accumulated > 1:
        logging.info(
            'Using gradient accumulation. Effective global batch size = {}'.
            format(num_accumulated * args.batch_size * len(ctx_l) *
                   num_workers))
        for p in params:
            p.grad_req = 'add'

    num_steps = args.num_steps
    warmup_steps = int(num_steps * args.warmup_ratio)
    log_interval = args.log_interval
    save_interval = args.ckpt_interval
    logging.info(
        '#Total Training Steps={}, Warmup Steps={}, Save Interval={}'.format(
            num_steps, warmup_steps, save_interval))
    optimizer_params = {'learning_rate': args.lr, 'wd': args.wd}
    if args.optimizer == 'adamw':
        optimizer_params.update({
            'beta1': 0.9,
            'beta2': 0.999,
            'epsilon': 1e-6,
            'correct_bias': False,
        })
    if args.comm_backend == 'horovod':
        trainer = hvd.DistributedTrainer(param_dict, args.optimizer,
                                         optimizer_params)
    elif args.comm_backend == 'byteps':
        trainer = bps.DistributedTrainer(param_dict, args.optimizer,
                                         optimizer_params)
    else:
        trainer = mx.gluon.Trainer(param_dict,
                                   args.optimizer,
                                   optimizer_params,
                                   update_on_kvstore=False)
    if args.start_step:
        logging.info('Restart training from {}'.format(args.start_step))
        states_option(args.start_step, trainer, args.ckpt_dir, local_rank,
                      'Loading')

    # backend specific implementation
    if args.comm_backend == 'byteps':
        trainer._init_params()
    if args.comm_backend == 'horovod':
        # Horovod: fetch and broadcast parameters
        hvd.broadcast_parameters(param_dict, root_rank=0)

    # prepare the loss function
    nsp_loss_fn = mx.gluon.loss.SoftmaxCELoss()
    mlm_loss_fn = mx.gluon.loss.SoftmaxCELoss()
    nsp_loss_fn.hybridize()
    mlm_loss_fn.hybridize()

    mlm_metric = MaskedAccuracy()
    nsp_metric = MaskedAccuracy()
    mlm_metric.reset()
    nsp_metric.reset()

    step_num = args.start_step
    if args.phase2:
        step_num -= args.phase1_num_steps

    running_mlm_loss, running_nsp_loss = 0., 0.
    running_num_tks = 0

    train_start_time = time.time()
    tic = time.time()
    # start training
    train_loop_dataloader = grouper(repeat(data_train), len(ctx_l))
    while step_num < num_steps:
        step_num += 1
        for _ in range(num_accumulated):
            sample_l = next(train_loop_dataloader)
            mlm_loss_l = []
            nsp_loss_l = []
            loss_l = []
            ns_label_list, ns_pred_list = [], []
            mask_label_list, mask_pred_list, mask_weight_list = [], [], []
            for sample, ctx in zip(sample_l, ctx_l):
                # prepare data
                (input_id, masked_id, masked_position, masked_weight, \
                    next_sentence_label, segment_id, valid_length) = sample
                input_id = input_id.as_in_ctx(ctx)
                masked_id = masked_id.as_in_ctx(ctx)
                masked_position = masked_position.as_in_ctx(ctx)
                masked_weight = masked_weight.as_in_ctx(ctx)
                next_sentence_label = next_sentence_label.as_in_ctx(ctx)
                segment_id = segment_id.as_in_ctx(ctx)
                valid_length = valid_length.as_in_ctx(ctx)

                with mx.autograd.record():
                    _, _, nsp_score, mlm_scores = model(
                        input_id, segment_id, valid_length, masked_position)
                    denominator = (masked_weight.sum() +
                                   1e-8) * num_accumulated * len(ctx_l)
                    mlm_scores_r = mx.npx.reshape(mlm_scores, (-5, -1))
                    masked_id_r = masked_id.reshape((-1, ))
                    mlm_loss = mlm_loss_fn(mlm_scores_r, masked_id_r,
                                           masked_weight.reshape(
                                               (-1, 1))).sum() / denominator
                    denominator = num_accumulated * len(ctx_l)
                    nsp_loss = nsp_loss_fn(
                        nsp_score, next_sentence_label).mean() / denominator
                    mlm_loss_l.append(mlm_loss)
                    nsp_loss_l.append(nsp_loss)
                    loss_l.append(mlm_loss + nsp_loss)
                    mask_label_list.append(masked_id_r)
                    mask_pred_list.append(mlm_scores_r)
                    mask_weight_list.append(masked_weight.reshape((-1, )))
                    ns_label_list.append(next_sentence_label)
                    ns_pred_list.append(nsp_score)

                running_num_tks += valid_length.sum().as_in_ctx(mx.cpu())

            for loss in loss_l:
                loss.backward()

            running_mlm_loss += sum([
                ele.as_in_ctx(mx.cpu()) for ele in mlm_loss_l
            ]).asnumpy().item()
            running_nsp_loss += sum([
                ele.as_in_ctx(mx.cpu()) for ele in nsp_loss_l
            ]).asnumpy().item()
            mlm_metric.update(mask_label_list, mask_pred_list,
                              mask_weight_list)
            nsp_metric.update(ns_label_list, ns_pred_list)
        # update
        trainer.allreduce_grads()

        total_norm, ratio, is_finite = clip_grad_global_norm(
            params, args.max_grad_norm * num_workers)
        total_norm = total_norm / num_workers

        # update learning rate
        scheduled_lr = args.lr
        if step_num <= warmup_steps:
            scheduled_lr *= step_num / warmup_steps
        else:
            offset = (num_steps - step_num) / (num_steps - warmup_steps)
            scheduled_lr *= max(offset, 0)
        trainer.set_learning_rate(scheduled_lr)

        if args.comm_backend == 'horovod' or args.comm_backend == 'byteps':
            # Note that horovod.trainer._scale defaults to num_workers,
            # thus trainer.update(1) will scale the gradients by 1./num_workers.
            # *num_workers* of Horovod is the number of GPUs.
            trainer.update(1, ignore_stale_grad=True)
        else:
            # gluon.trainer._scale defaults to 1.
            # *num_workers* of Trainer is the number of machines.
            trainer.update(num_workers, ignore_stale_grad=True)

        if num_accumulated > 1:
            # set grad to zero for gradient accumulation
            model.zero_grad()

        # saving
        if step_num % save_interval == 0 or step_num >= num_steps:
            states_option(step_num, trainer, args.ckpt_dir, local_rank,
                          'Saving')
            if local_rank == 0:
                parameters_option(step_num, model, args.ckpt_dir, 'Saving')
        # logging
        if step_num % log_interval == 0:
            running_mlm_loss /= log_interval
            running_nsp_loss /= log_interval
            toc = time.time()
            logging.info(
                '[step {}], Loss mlm/nsp={:.5f}/{:.3f}, Acc mlm/nsp={:.3f}/{:.3f}, '
                ' LR={:.7f}, grad_norm={:.4f}. Time cost={:.2f} s,'
                ' Throughput={:.1f}K tks/s, ETA={:.2f} h'.format(
                    step_num, running_mlm_loss, running_nsp_loss,
                    mlm_metric.get()[1],
                    nsp_metric.get()[1], trainer.learning_rate, total_norm,
                    toc - tic,
                    running_num_tks.asnumpy().item() / (toc - tic) / 1000,
                    (num_steps - step_num) /
                    (step_num / (toc - train_start_time)) / 3600))
            mlm_metric.reset()
            nsp_metric.reset()
            tic = time.time()

            running_mlm_loss = 0
            running_nsp_loss = 0
            running_num_tks = 0

    logging.info('Finish training step: %d', step_num)

    mx.npx.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f} s'.format(train_end_time -
                                              train_start_time))

    if local_rank == 0:
        model_name = args.model_name.replace('google', 'gluon')
        save_dir = os.path.join(args.ckpt_dir, model_name)
        final_save(model, save_dir, tokenizer, cfg)
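The learning-rate logic inside the loop above is a linear warmup followed by a linear decay to zero. Factored out as a standalone helper, it would read roughly:

def linear_warmup_decay(base_lr, step, warmup_steps, total_steps):
    # Mirrors the schedule computed inline in the training loop above.
    if step <= warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr * max((total_steps - step) / (total_steps - warmup_steps), 0.0)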
Example #9
def train(args):
    store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
        args.comm_backend, args.gpus)
    task = get_task(args.task_name)
    # setup_logging(args, local_rank)
    level = logging.INFO
    detail_dir = os.path.join(args.output_dir, args.task_name)
    os.makedirs(detail_dir, exist_ok=True)
    logging_config(
        detail_dir,
        name='train_{}_{}_'.format(args.task_name, args.model_name) +
        str(rank),  # avoid race
        level=level,
        console=(local_rank == 0))
    logging.info(args)
    cfg, tokenizer, classify_net, use_segmentation = \
        get_network(args.model_name, ctx_l,
                    args.param_checkpoint,
                    args.backbone_path,
                    task)
    logging.info('Prepare training data')
    train_data, _ = get_task_data(args, tokenizer, segment='train', task=task)
    train_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()),
                              bf.Stack())

    epoch_num_updates = len(train_data) // args.batch_size
    max_update = epoch_num_updates * args.epochs
    warmup_steps = int(np.ceil(max_update * args.warmup_ratio))

    dataloader = DataLoader(train_data,
                            batch_size=args.batch_size,
                            batchify_fn=train_batchify,
                            num_workers=4,
                            shuffle=True)
    dataloader = grouper(repeat(dataloader), len(ctx_l))

    param_dict = classify_net.collect_params()
    # Do not apply weight decay to all the LayerNorm and bias
    for _, v in classify_net.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Set grad_req if gradient accumulation is required
    params = [p for p in param_dict.values() if p.grad_req != 'null']
    num_accumulated = args.num_accumulated
    if num_accumulated > 1:
        logging.info(
            'Using gradient accumulation. Effective global batch size = {}'.
            format(num_accumulated * args.batch_size * len(ctx_l) *
                   num_workers))
        for p in params:
            p.grad_req = 'add'

    if args.comm_backend == 'horovod':
        # Horovod: fetch and broadcast parameters
        hvd.broadcast_parameters(param_dict, root_rank=0)

    lr_scheduler = PolyScheduler(max_update=max_update,
                                 base_lr=args.lr,
                                 warmup_begin_lr=0.0,
                                 pwr=1,
                                 final_lr=0.0,
                                 warmup_steps=warmup_steps,
                                 warmup_mode='linear')
    optimizer_params = {
        'learning_rate': args.lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler
    }
    if args.comm_backend == 'horovod':
        trainer = hvd.DistributedTrainer(param_dict, args.optimizer,
                                         optimizer_params)
    else:
        trainer = mx.gluon.Trainer(param_dict, args.optimizer,
                                   optimizer_params)

    if args.task_name == 'sts':
        loss_function = gluon.loss.L2Loss()
    else:
        loss_function = gluon.loss.SoftmaxCELoss()

    # Prepare the logging statistics
    log_loss = 0
    log_gnorm = 0
    log_step = 0
    if args.log_interval > 0:
        log_interval = args.log_interval
    else:
        log_interval = int(epoch_num_updates * 0.5)

    for i in range(max_update):
        sample_l = next(dataloader)
        loss_l = []
        for sample, ctx in zip(sample_l, ctx_l):
            (token_ids, token_types, valid_length), label = sample
            # Move to the corresponding context
            token_ids = mx.np.array(token_ids, ctx=ctx)
            token_types = mx.np.array(token_types, ctx=ctx)
            valid_length = mx.np.array(valid_length, ctx=ctx)
            label = mx.np.array(label, ctx=ctx)
            with mx.autograd.record():
                scores = classify_net(token_ids, token_types, valid_length)
                loss = loss_function(scores, label).mean() / len(ctx_l)
                loss_l.append(loss)
        for loss in loss_l:
            loss.backward()
        trainer.allreduce_grads()
        # Begin Norm Clipping
        total_norm, ratio, is_finite = clip_grad_global_norm(
            params, args.max_grad_norm)
        trainer.update(1.0)
        step_loss = sum([loss.asnumpy() for loss in loss_l])
        log_loss += step_loss
        log_gnorm += total_norm
        log_step += 1
        if log_step >= log_interval or i == max_update - 1:
            logging.info(
                '[Iter {} / {}] avg {} = {:.2f}, avg gradient norm = {:.2f}'.
                format(i + 1, max_update, 'nll', log_loss / log_step,
                       log_gnorm / log_step))
            log_loss = 0
            log_gnorm = 0
            log_step = 0
        if local_rank == 0 and (i == max_update - 1 or i %
                                (max_update // args.epochs) == 0 and i > 0):
            ckpt_name = '{}_{}_{}.params'.format(args.model_name,
                                                 args.task_name, (i + 1))

            params_saved = os.path.join(detail_dir, ckpt_name)
            classify_net.save_parameters(params_saved)
            logging.info('Params saved in: {}'.format(params_saved))
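Incidentally, the PolyScheduler call above, with pwr=1 and final_lr=0.0, reduces to the same linear warmup plus linear decay that the BERT pretraining example computes by hand. A quick sanity check, assuming the semantics of MXNet's mxnet.lr_scheduler.PolyScheduler:

from mxnet.lr_scheduler import PolyScheduler

sched = PolyScheduler(max_update=1000, base_lr=1e-4, warmup_begin_lr=0.0,
                      pwr=1, final_lr=0.0, warmup_steps=100,
                      warmup_mode='linear')
assert abs(sched(50) - 1e-4 * 50 / 100) < 1e-12                      # warmup
assert abs(sched(550) - 1e-4 * (1000 - 550) / (1000 - 100)) < 1e-12  # decay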
Example #10
def parse_args():
    parser = argparse.ArgumentParser(
        description='Transformer for Neural Machine Translation. Load a checkpoint and run inference.')
    parser.add_argument('--seed', type=int, default=100, help='The random seed.')
    parser.add_argument('--src_lang', type=str, default='en', help='Source language')
    parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
    parser.add_argument('--src_corpus', type=str, required=True,
                        help='The source corpus for evaluation.')
    parser.add_argument('--tgt_corpus', type=str, default=None,
                        help='The target corpus for evaluation.')
    parser.add_argument('--src_normalizer', choices=['no', 'moses'],
                        default='moses', help='The sentence normalizer that will be '
                                              'used to normalize the source sentence.')
    parser.add_argument('--src_base_tokenizer', choices=['whitespace', 'moses',
                                                         'no'],
                        default='moses', help='The base tokenizer to tokenize the source '
                                              'sentence into a list of tokens.')
    parser.add_argument('--src_tokenizer', choices=['spm',
                                                    'subword_nmt',
                                                    'yttm',
                                                    'hf_bytebpe',
                                                    'hf_wordpiece',
                                                    'hf_bpe'],
                        required=True, type=str,
                        help='The source subword tokenizer. '
                             'Only supports online encoding at present.')
    parser.add_argument('--tgt_normalizer', choices=['no', 'moses'],
                        default='moses', help='The sentence normalizer that will be '
                                              'used to normalize the target sentence.')
    parser.add_argument('--tgt_base_tokenizer', choices=['whitespace', 'moses',
                                                         'no'],
                        default='moses', help='The base tokenizer to tokenize the target '
                                              'sentence into a list of tokens.')
    parser.add_argument('--tgt_tokenizer', choices=['spm',
                                                    'subword_nmt',
                                                    'yttm',
                                                    'hf_bytebpe',
                                                    'hf_wordpiece',
                                                    'hf_bpe'],
                        required=True, type=str,
                        help='The target tokenizer. Only supports online encoding at present.')
    parser.add_argument('--src_subword_model_path', type=str,
                        help='Path to the source subword model.')
    parser.add_argument('--src_vocab_path', type=str,
                        help='Path to the source subword vocab.')
    parser.add_argument('--tgt_subword_model_path', type=str,
                        help='Path to the target subword model.')
    parser.add_argument('--tgt_vocab_path', type=str,
                        help='Path to the target subword vocab.')
    parser.add_argument('--src_max_len', type=int, default=None,
                        help='Maximum length of the source sentence.')
    parser.add_argument('--tgt_max_len', type=int, default=None,
                        help='Maximum length of the target sentence.')
    parser.add_argument('--cfg', type=str, help='Config file of the Transformer model.')
    parser.add_argument('--beam-size', type=int, default=4, help='Number of beams')
    parser.add_argument('--lp_alpha', type=float, default=0.6,
                        help='The alpha value in the length penalty')
    parser.add_argument('--lp_k', type=int, default=5, help='The K value in the length penalty')
    parser.add_argument('--max_length_a', type=int, default=1,
                        help='The a in the a * x + b formula of beam search')
    parser.add_argument('--max_length_b', type=int, default=50,
                        help='The b in the a * x + b formula of beam search')
    parser.add_argument('--param_path', type=str, help='The path to the model parameters.')
    parser.add_argument('--gpus', type=str, default='0',
                        help='List of GPUs to run on, e.g. 0 or 0,2,5. Empty means using CPU. '
                             '(Using a single GPU is suggested.)')
    parser.add_argument('--save_dir', type=str, default=None,
                        help='The path to save the log files and predictions.')
    parser.add_argument('--stochastic', action='store_true',
                        help='Whether to use the stochastic beam search')
    parser.add_argument('--temperature', type=float, default=None,
                        help='The temperature used for softmax normalization in the stochastic setting')
    parser.add_argument('--inference', action='store_true',
                        help='Whether to run inference on your own data. '
                        'When applying inference, tgt_corpus is not needed and will be set to None.')
    parser.add_argument('--fp16', action='store_true',
                        help='Whether to use dtype float16')
    args = parser.parse_args()
    if args.save_dir is None:
        args.save_dir = os.path.splitext(args.param_path)[0] + '_evaluation'
    assert args.inference or args.tgt_corpus, 'requiring --tgt_corpus when not using --inference'
    if args.inference:
        args.tgt_corpus = None
    if args.stochastic:
        if args.temperature is None:
            args.temperature = 0.5
    logging_config(args.save_dir, console=True)
    logging.info(args)
    return args
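
The --lp_alpha/--lp_k and --max_length_a/--max_length_b flags above control hypothesis scoring and the maximum decoding length in beam search. Below is a minimal sketch of the two formulas, assuming the GNMT-style length penalty that the defaults (alpha=0.6, K=5) suggest; the helper names are illustrative only:

def length_penalty(length, alpha=0.6, K=5):
    # GNMT length penalty: lp(Y) = ((K + |Y|) / (K + 1)) ** alpha
    return ((K + length) / (K + 1)) ** alpha

def max_decode_length(src_length, a=1, b=50):
    # Decoding is capped at a * x + b tokens, where x is the source length
    return a * src_length + b

# A hypothesis with log-probability s and length L is ranked by s / length_penalty(L).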
Example #11
    rtd_preds = mx.np.round((mx.np.sign(rtd_scores) + 1) / 2).astype(np.int32)

    mlm_accuracy = accuracy(unmasked_tokens, mlm_preds, masked_weights)
    corrupted_mlm_accuracy = accuracy(unmasked_tokens, corrupted_tokens,
                                      masked_weights)
    rtd_accuracy = accuracy(rtd_labels, rtd_preds, length_masks)
    rtd_precision = accuracy(rtd_labels, rtd_preds, length_masks * rtd_preds)
    rtd_recall = accuracy(rtd_labels, rtd_preds, length_masks * rtd_labels)
    rtd_auc = auc(rtd_labels, rtd_probs, length_masks)
    writer.add_scalars(
        'results', {
            'mlm_accuracy': mlm_accuracy.asnumpy().item(),
            'corrupted_mlm_accuracy': corrupted_mlm_accuracy.asnumpy().item(),
            'rtd_accuracy': rtd_accuracy.asnumpy().item(),
            'rtd_precision': rtd_precision.asnumpy().item(),
            'rtd_recall': rtd_recall.asnumpy().item(),
            'rtd_auc': rtd_auc
        }, step_num)


if __name__ == '__main__':
    os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
    os.environ['MXNET_USE_FUSION'] = '0'  # Manually disable pointwise fusion
    args = parse_args()
    logging_config(args.output_dir, name='pretrain_owt')
    logging.info(args)
    set_seed(args.seed)
    logging.debug('Random seed set to {}'.format(args.seed))
    if args.do_train:
        train(args)
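
The RTD metrics in the example above derive precision and recall from a single weighted-accuracy helper: masking by the predictions counts only predicted positives, while masking by the labels counts only actual positives. A small NumPy sketch of that trick, assuming accuracy(labels, preds, mask) = sum((labels == preds) * mask) / sum(mask):

import numpy as np

def accuracy(labels, preds, mask):
    # Weighted accuracy: fraction correct among positions where mask > 0
    return ((labels == preds) * mask).sum() / mask.sum()

labels = np.array([1, 0, 1, 1])
preds = np.array([1, 1, 0, 1])
mask = np.ones_like(labels)

precision = accuracy(labels, preds, mask * preds)  # TP / predicted positives -> 2/3
recall = accuracy(labels, preds, mask * labels)    # TP / actual positives -> 2/3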
Example #12
        ckpt_candidates = [args.param_checkpoint]
    else:
        ckpt_candidates = [f for f in os.listdir(args.output_dir) if f.endswith('.params')]
        ckpt_candidates.sort(key=lambda ele: (len(ele), ele))
    if last:
        ckpt_candidates = ckpt_candidates[-1:]

    best_eval = {}
    for ckpt_name in ckpt_candidates:
        logging.info('Start evaluating checkpoint {}'.format(ckpt_name))
        ckpt_path = os.path.join(args.output_dir, ckpt_name)
        qa_net.load_parameters(ckpt_path, ctx=ctx_l, cast_dtype=True)
        best_eval = eval_validation(ckpt_name, best_eval)

    logging.info('Best evaluation results: {}'.format(json.dumps(best_eval)))
    output_eval_results_file = os.path.join(args.output_dir, 'best_results.json')
    with open(output_eval_results_file, 'w') as of:
        of.write(json.dumps(best_eval, indent=4) + '\n')
    return best_eval


if __name__ == '__main__':
    os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
    args = parse_args()
    logging_config(args.output_dir, name='finetune_squad{}'.format(args.version))
    set_seed(args.seed)
    if args.do_train:
        train(args)
    if args.do_eval:
        evaluate(args, last=not args.all_evaluate)
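
Sorting the checkpoint candidates by (len(ele), ele) in the example above is a cheap natural ordering: when the filenames differ only in a numeric suffix, shorter names (smaller numbers) sort first, so taking the last element really does pick the latest checkpoint. A quick illustration:

ckpts = ['epoch10.params', 'epoch2.params', 'epoch1.params']
sorted(ckpts)                                   # lexicographic: epoch1, epoch10, epoch2
sorted(ckpts, key=lambda ele: (len(ele), ele))  # natural: epoch1, epoch2, epoch10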
Example #13
def train(args):
    store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
        args.comm_backend, args.gpus)
    logging_config(
        args.output_dir,
        name='pretrain_owt_' + str(rank),  # avoid race
        console=(local_rank == 0))
    logging.info(args)
    set_seed(args.seed)
    logging.debug('Random seed set to {}'.format(args.seed))
    logging.info('Training info: num_buckets: {}, '
                 'num_workers: {}, rank: {}'.format(args.num_buckets,
                                                    num_workers, rank))
    cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l,
                                                  args.max_seq_length,
                                                  args.hidden_dropout_prob,
                                                  args.attention_dropout_prob,
                                                  args.generator_units_scale,
                                                  args.generator_layers_scale)
    data_masker = ElectraMasker(tokenizer,
                                args.max_seq_length,
                                mask_prob=args.mask_prob,
                                replace_prob=args.replace_prob)
    if args.from_raw_text:
        if args.cached_file_path and not os.path.exists(args.cached_file_path):
            os.mkdir(args.cached_file_path)
        get_dataset_fn = functools.partial(
            get_pretrain_data_text,
            max_seq_length=args.max_seq_length,
            short_seq_prob=args.short_seq_prob,
            tokenizer=tokenizer,
            circle_length=args.circle_length,
            repeat=args.repeat,
            cached_file_path=args.cached_file_path)

        logging.info(
            'Processing and loading the training dataset from raw text.')

    else:
        logging.info('Loading the training dataset from local Numpy file.')
        get_dataset_fn = get_pretrain_data_npz

    data_train = get_dataset_fn(args.data,
                                args.batch_size,
                                shuffle=True,
                                num_buckets=args.num_buckets,
                                vocab=tokenizer.vocab,
                                num_parts=num_workers,
                                part_idx=rank,
                                num_dataset_workers=args.num_dataset_workers,
                                num_batch_workers=args.num_batch_workers)

    logging.info('Creating distributed trainer...')
    param_dict = model.collect_params()
    # Do not apply weight decay to LayerNorm parameters and biases
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in param_dict.values() if p.grad_req != 'null']
    # Set grad_req if gradient accumulation is required
    num_accumulated = args.num_accumulated
    if num_accumulated > 1:
        logging.info(
            'Using gradient accumulation. Effective global batch size = {}'.
            format(num_accumulated * args.batch_size * len(ctx_l) *
                   num_workers))
        for p in params:
            p.grad_req = 'add'
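        # Accumulated gradients are cleared manually via model.zero_grad()
        # after every optimizer step (see below).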
    # backend specific implementation
    if args.comm_backend == 'horovod':
        # Horovod: fetch and broadcast parameters
        hvd.broadcast_parameters(param_dict, root_rank=0)

    num_train_steps = args.num_train_steps
    if args.warmup_steps is not None:
        warmup_steps = args.warmup_steps
    else:
        warmup_steps = int(num_train_steps * args.warmup_ratio)
    assert warmup_steps is not None, 'Must specify either warmup_steps or warmup_ratio'
    log_interval = args.log_interval
    save_interval = args.save_interval if args.save_interval is not None\
        else num_train_steps // 50
    logging.info(
        '#Total Training Steps={}, Warmup={}, Save Interval={}'.format(
            num_train_steps, warmup_steps, save_interval))

    lr_scheduler = PolyScheduler(max_update=num_train_steps,
                                 base_lr=args.lr,
                                 warmup_begin_lr=0,
                                 pwr=1,
                                 final_lr=0,
                                 warmup_steps=warmup_steps,
                                 warmup_mode='linear')
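    # With pwr=1, PolyScheduler warms the learning rate linearly from 0 to
    # base_lr over `warmup_steps`, then decays it linearly to final_lr=0 at
    # `max_update`.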
    optimizer_params = {
        'learning_rate': args.lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler,
    }
    if args.optimizer == 'adamw':
        optimizer_params.update({
            'beta1': 0.9,
            'beta2': 0.999,
            'epsilon': 1e-6,
            'correct_bias': False,
        })
    if args.comm_backend == 'horovod':
        trainer = hvd.DistributedTrainer(param_dict, args.optimizer,
                                         optimizer_params)
    else:
        trainer = mx.gluon.Trainer(param_dict,
                                   args.optimizer,
                                   optimizer_params,
                                   update_on_kvstore=False)
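        # update_on_kvstore=False so that gradients can be all-reduced and
        # clipped manually before trainer.update() is called.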
    if args.start_step:
        logging.info('Restart training from {}'.format(args.start_step))
        # TODO(zheyuye), How about data splitting, where to start re-training
        state_path = states_option(args.start_step, trainer, args.output_dir,
                                   local_rank, 'Loading')
        param_path = parameters_option(args.start_step, model, args.output_dir,
                                       'Loading')

    # prepare the loss function
    mlm_loss_fn = mx.gluon.loss.SoftmaxCELoss()
    rtd_loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
    mlm_loss_fn.hybridize()
    rtd_loss_fn.hybridize()

    # prepare the records writer
    writer = None
    # only one process on each worker writes the tensorboardX records, to avoid races
    if args.do_eval and local_rank == 0:
        from tensorboardX import SummaryWriter
        record_path = os.path.join(args.output_dir, 'records')
        logging.info('Evaluation records saved in {}'.format(record_path))
        writer = SummaryWriter(record_path)

    step_num = args.start_step
    finish_flag = False

    log_total_loss = 0
    log_mlm_loss = 0
    log_rtd_loss = 0
    log_sample_num = 0
    train_start_time = time.time()

    # start training
    train_loop_dataloader = grouper(repeat(data_train), len(ctx_l))
    tic = time.time()
    while step_num < num_train_steps:
        for accum_idx in range(num_accumulated):
            sample_l = next(train_loop_dataloader)
            loss_l = []
            mlm_loss_l = []
            rtd_loss_l = []
            for sample, ctx in zip(sample_l, ctx_l):
                if sample is None:
                    continue
                # prepare data
                input_ids, segment_ids, valid_lengths = sample
                input_ids = input_ids.as_in_ctx(ctx)
                segment_ids = segment_ids.as_in_ctx(ctx)
                valid_lengths = valid_lengths.as_in_ctx(ctx)
                masked_input = data_masker.dynamic_masking(
                    mx.nd, input_ids, valid_lengths)
                masked_input_ids = masked_input.input_ids
                length_masks = masked_input.masks
                unmasked_tokens = masked_input.unmasked_tokens
                masked_positions = masked_input.masked_positions
                masked_weights = masked_input.masked_weights

                log_sample_num += len(masked_input_ids)

                with mx.autograd.record():
                    mlm_scores, rtd_scores, corrupted_tokens, labels = model(
                        masked_input_ids, segment_ids, valid_lengths,
                        unmasked_tokens, masked_positions)
                    denominator = (masked_weights.sum() +
                                   1e-6) * num_accumulated * len(ctx_l)
                    mlm_loss = mlm_loss_fn(
                        mx.npx.reshape(mlm_scores, (-5, -1)),
                        unmasked_tokens.reshape(
                            (-1, )), masked_weights.reshape(
                                (-1, 1))).sum() / denominator
                    denominator = (length_masks.sum() +
                                   1e-6) * num_accumulated * len(ctx_l)
                    rtd_loss = rtd_loss_fn(rtd_scores, labels,
                                           length_masks).sum() / denominator
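                    # Both losses are normalized by token counts summed over
                    # devices and accumulation steps, matching the gradient
                    # scale of one large batch.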
                    output = ElectraOutput(
                        mlm_scores=mlm_scores,
                        rtd_scores=rtd_scores,
                        rtd_labels=labels,
                        corrupted_tokens=corrupted_tokens,
                    )
                    mlm_loss_l.append(mlm_loss)
                    rtd_loss_l.append(rtd_loss)
                    loss = (args.gen_weight * mlm_loss +
                            args.disc_weight * rtd_loss)
                    loss_l.append(loss)

            for loss in loss_l:
                loss.backward()
            # Sum the per-device step losses onto one context for logging
            log_mlm_loss += sum(
                [ele.as_in_ctx(ctx_l[0]) for ele in mlm_loss_l]).asnumpy()
            log_rtd_loss += sum(
                [ele.as_in_ctx(ctx_l[0]) for ele in rtd_loss_l]).asnumpy()
            log_total_loss += sum([ele.as_in_ctx(ctx_l[0])
                                   for ele in loss_l]).asnumpy()

        # update
        trainer.allreduce_grads()

        total_norm, ratio, is_finite = clip_grad_global_norm(
            params, args.max_grad_norm * num_workers)

        if args.comm_backend == 'horovod':
            # Note that horovod.trainer._scale defaults to num_workers,
            # thus trainer.update(1) will scale the gradients by 1./num_workers
            trainer.update(1, ignore_stale_grad=True)
        else:
            # gluon.trainer._scale defaults to 1
            trainer.update(num_workers, ignore_stale_grad=True)

        total_norm = total_norm / num_workers
        step_num += 1
        if num_accumulated > 1:
            # set grad to zero for gradient accumulation
            model.zero_grad()

        # saving
        if step_num % save_interval == 0 or step_num >= num_train_steps:
            if is_master_node:
                states_option(step_num, trainer, args.output_dir, local_rank,
                              'Saving')
                if local_rank == 0:
                    param_path = parameters_option(step_num, model,
                                                   args.output_dir, 'Saving')

        # logging
        if step_num % log_interval == 0:
            # Average the accumulated losses over the logging window
            log_mlm_loss /= log_interval
            log_rtd_loss /= log_interval
            log_total_loss /= log_interval
            toc = time.time()
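            # ETA below = remaining steps / average steps-per-second since
            # training started, converted to hours.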
            logging.info('[step {}], Loss mlm/rtd/total={:.4f}/{:.4f}/{:.4f},'
                         ' LR={:.6f}, grad_norm={:.4f}. Time cost={:.2f},'
                         ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format(
                             step_num, log_mlm_loss, log_rtd_loss,
                             log_total_loss, trainer.learning_rate, total_norm,
                             toc - tic, log_sample_num / (toc - tic),
                             (num_train_steps - step_num) /
                             (step_num / (toc - train_start_time)) / 3600))
            tic = time.time()

            if args.do_eval:
                evaluation(writer, step_num, masked_input, output)
                if writer is not None:
                    writer.add_scalars(
                        'loss', {
                            'total_loss': log_total_loss,
                            'mlm_loss': log_mlm_loss,
                            'rtd_loss': log_rtd_loss
                        }, step_num)
            log_mlm_loss = 0
            log_rtd_loss = 0
            log_total_loss = 0
            log_sample_num = 0

    logging.info('Finish training step: %d', step_num)
    if is_master_node:
        state_path = states_option(step_num, trainer, args.output_dir,
                                   local_rank, 'Saving')
        if local_rank == 0:
            param_path = parameters_option(step_num, model, args.output_dir,
                                           'Saving')

    mx.npx.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f}s'.format(train_end_time -
                                             train_start_time))

    if writer is not None:
        writer.close()

    if local_rank == 0:
        model_name = args.model_name.replace('google', 'gluon')
        save_dir = os.path.join(args.output_dir, model_name)
        final_save(model, save_dir, tokenizer)


def evaluate(args):
    store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
        args.comm_backend, args.gpus)
    # setup_logging(args, local_rank)
    task = get_task(args.task_name, args.train_dir, args.eval_dir)
    level = logging.INFO
    detail_dir = os.path.join(args.output_dir, args.task_name)
    if not os.path.exists(detail_dir):
        os.mkdir(detail_dir)
    logging_config(
        detail_dir,
        name='train_{}_{}_'.format(args.task_name, args.model_name) +
        str(rank),  # avoid race
        level=level,
        console=(local_rank == 0))
    if rank != 0:
        logging.info('Skipping node {}'.format(rank))
        return
    ctx_l = parse_ctx(args.gpus)
    logging.info(
        'Starting inference without horovod on the first node on devices {}'.
        format(str(ctx_l)))

    cfg, tokenizer, classify_net, use_segmentation = \
        get_network(args.model_name, ctx_l,
                    args.param_checkpoint,
                    args.backbone_path,
                    task)
    candidate_ckpt = []
    detail_dir = os.path.join(args.output_dir, args.task_name)
    for name in os.listdir(detail_dir):
        if name.endswith(
                '.params'
        ) and args.task_name in name and args.model_name in name:
            candidate_ckpt.append(os.path.join(detail_dir, name))
    best_ckpt = {}
    metrics = task.metric

    def evaluate_by_ckpt(ckpt_name, best_ckpt):
        classify_net.load_parameters(ckpt_name, ctx=ctx_l, cast_dtype=True)
        logging.info('Prepare dev data')

        dev_data, label = get_task_data(args, task, tokenizer, segment='eval')
        dev_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()),
                                bf.Stack())
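        # The nested Group mirrors each sample's structure:
        # ((token_ids, token_types, valid_length), label)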
        dataloader = DataLoader(dev_data,
                                batch_size=args.batch_size,
                                batchify_fn=dev_batchify,
                                shuffle=False)

        for sample_l in grouper(dataloader, len(ctx_l)):
            for sample, ctx in zip(sample_l, ctx_l):
                if sample is None:
                    continue
                (token_ids, token_types, valid_length), label = sample
                token_ids = mx.np.array(token_ids, ctx=ctx)
                token_types = mx.np.array(token_types, ctx=ctx)
                valid_length = mx.np.array(valid_length, ctx=ctx)
                scores = classify_net(token_ids, token_types, valid_length)

                if task.task_name == 'sts':
                    label = label.reshape((-1, 1))
                for metric in metrics:
                    metric.update([label], [scores])
                #pred.append(scores)

        for metric in metrics:
            metric_name, result = metric.get()
            logging.info('checkpoint {} got result {}: {}'.format(
                ckpt_name, metric_name, result))
            if best_ckpt.get(metric_name, [0, ''])[0] < result:
                best_ckpt[metric_name] = [result, ckpt_name]
        for metric in metrics:
            metric.reset()

    for ckpt_name in candidate_ckpt:
        evaluate_by_ckpt(ckpt_name, best_ckpt)
    for metric_name in best_ckpt:
        logging.info(
            'Best result on metric {}: {} (checkpoint {})'.format(
                metric_name, best_ckpt[metric_name][0],
                best_ckpt[metric_name][1]))
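
Both the training and evaluation loops above deal samples out to the devices with grouper, relying on the last group being padded with None (hence the `if sample is None: continue` guards). A minimal sketch of such a helper, assuming the classic itertools recipe with None as the fill value:

import itertools

def grouper(iterable, n, fillvalue=None):
    # Collect items into fixed-length chunks: grouper('ABCDE', 2) -> AB CD E<None>
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)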