Example #1
# Imports this snippet relies on; args (the parsed command-line options),
# local_rank, and get_model_loss are defined elsewhere in the script.
import os
import random
import logging
import warnings

import mxnet as mx
import gluonnlp as nlp

if __name__ == '__main__':
    random_seed = random.randint(0, 1000)
    nlp.utils.mkdir(args.ckpt_dir)
    ctx = mx.gpu(local_rank)

    dataset_name, vocab = args.dataset_name, None
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece')
            dataset_name = None
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)

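    # Build the BERT model together with its two pre-training losses
    # (next-sentence prediction and masked LM); presumably restoring
    # parameters from ckpt_dir when start_step > 0.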
    model, nsp_loss, mlm_loss, vocab = get_model_loss([ctx], args.model, args.pretrained,
                                                      dataset_name, vocab, args.dtype,
                                                      ckpt_dir=args.ckpt_dir,
                                                      start_step=args.start_step)
    logging.debug('Model created')
    data_eval = args.data_eval

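    # --raw indicates data_eval points at plain text that still needs
    # tokenization, so construct the matching tokenizer first.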
    if args.raw:
        if args.sentencepiece:
            tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                                 num_best=args.sp_nbest,
                                                 alpha=args.sp_alpha, lower=not args.cased)
        else:
            tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

        cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
        cache_file = os.path.join(cache_dir, 'part-000.npz')
        nlp.utils.mkdir(cache_dir)
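
The snippet ends right after computing where preprocessed evaluation data would be cached. As a rough, minimal sketch of what could go into that cache (not the script's actual preprocessing pipeline), the following tokenizes a hypothetical sentence and stores its token ids in the part-000.npz file; sample_sentences is an invented placeholder for the raw evaluation text.

import numpy as np

# Hypothetical raw evaluation text; the real data comes from args.data_eval.
sample_sentences = ['BERT is a language model .']

# Subword-tokenize each sentence and map the tokens to ids with the vocab.
token_ids = [np.array(vocab[tokenizer(s)], dtype='int32')
             for s in sample_sentences]

# Cache the ids so later runs can skip tokenization.
np.savez(cache_file, *token_ids)
with np.load(cache_file) as cached:
    restored = [cached[name] for name in cached.files]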
Example #2
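This fragment comes from the same script's __main__ block; ctxs, args, get_model_loss, and the imports from Example #1 are assumed to be in scope.
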
    dataset_name, vocab = args.dataset_name, None
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s',
                     args.sentencepiece)
        if args.dataset_name:
            warnings.warn(
                'Both --dataset_name and --sentencepiece are provided. '
                'The vocabulary will be loaded based on --sentencepiece')
            dataset_name = None
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)

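    # Unlike Example #1, this variant of get_model_loss returns only the
    # model and vocab; the losses are presumably constructed elsewhere.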
    model, vocab = get_model_loss(ctxs,
                                  args.model,
                                  args.pretrained,
                                  dataset_name,
                                  vocab,
                                  args.dtype,
                                  ckpt_dir=args.ckpt_dir,
                                  start_step=args.start_step)
    logging.info('Model created')
    data_eval = args.data_eval

    if args.raw:
        if args.sentencepiece:
            tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece,
                                                 vocab,
                                                 lower=not args.cased)
        else:
            tokenizer = nlp.data.BERTTokenizer(vocab=vocab,
                                               lower=not args.cased)
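
As a quick usage sketch, reusing the tokenizer and vocab built above (the input string is hypothetical): the tokenizer splits raw text into subword pieces, and the vocabulary maps those pieces to integer ids.

tokens = tokenizer('gluonnlp makes pretraining easy .')
token_ids = vocab[tokens]
print(tokens)      # subword pieces, e.g. ['gluon', '##nl', '##p', ...]
print(token_ids)   # the corresponding integer ids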