def test_tokens_to_text(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    result = tokenizer.tokens_to_text(tokens)
    self.assertTrue(text == result)
def test_add_special_tokens(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
    tokenizer.add_special_tokens(special_tokens)
    self.assertTrue(tokenizer.vocab_size == tokenizer.original_vocab_size + len(special_tokens))
def test_text_to_tokens(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    self.assertTrue(len(tokens) == len(text.split()))
    self.assertTrue(tokens.count("[CLS]") == 1)
    self.assertTrue(tokens.count("[MASK]") == 1)
    self.assertTrue(tokens.count("[SEP]") == 2)
def test_text_to_ids(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    ids = tokenizer.text_to_ids(text)
    self.assertTrue(len(ids) == len(text.split()))
    self.assertTrue(ids.count(tokenizer.special_tokens["[CLS]"]) == 1)
    self.assertTrue(ids.count(tokenizer.special_tokens["[MASK]"]) == 1)
    self.assertTrue(ids.count(tokenizer.special_tokens["[SEP]"]) == 2)
def test_ids_to_text(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    ids = tokenizer.text_to_ids(text)
    result = tokenizer.ids_to_text(ids)
    self.assertTrue(text == result)
def test_ids_to_tokens(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)
    self.assertTrue(len(result) == len(tokens))
    for i in range(len(result)):
        self.assertTrue(result[i] == tokens[i])
def test_ids_to_text(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    ids = tokenizer.text_to_ids(text)
    result = tokenizer.ids_to_text(ids)
    self.assertTrue(text == result)
def test_ids_to_tokens(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)
    self.assertTrue(len(result) == len(tokens))
    for i in range(len(result)):
        self.assertTrue(result[i] == tokens[i])
def test_add_special_tokens(self):
    tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
    tokenizer.add_special_tokens(special_tokens)
    self.assertTrue(tokenizer.vocab_size == tokenizer.original_vocab_size + len(set(special_tokens.values())))
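# The test methods above assume a unittest.TestCase harness and tokenizer imports
# that are not shown here. Below is a minimal sketch of such a harness with one
# end-to-end round-trip check. The class name and import paths are assumptions
# inferred from how the tests reference SentencePieceTokenizer and
# nemo_nlp.data.tokenizers; adjust them to the actual package layout.
import unittest

from nemo.collections.nlp.data.tokenizers import SentencePieceTokenizer  # assumed path


class TestSentencePieceTokenizer(unittest.TestCase):
    MODEL_PATH = "./tests/data/m_common.model"

    def test_round_trip(self):
        # text -> ids -> text should be lossless once the special tokens that
        # appear in the sample sentence are registered with the tokenizer.
        tokenizer = SentencePieceTokenizer(self.MODEL_PATH)
        tokenizer.add_special_tokens(["[CLS]", "[MASK]", "[SEP]"])
        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        self.assertEqual(tokenizer.ids_to_text(tokenizer.text_to_ids(text)), text)


if __name__ == "__main__":
    unittest.main()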
output_file = f'{nf.work_dir}/output.txt'

if args.bert_checkpoint is None:
    """ Use this if you're using a standard BERT model.
    To see the list of pretrained models, call:
    nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models()
    """
    tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)
else:
    """ Use this if you're using a BERT model that you pre-trained yourself. """
    if args.tokenizer == "sentencepiece":
        special_tokens = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['bert']
        # Pass the BERT special tokens so the SentencePiece tokenizer recognizes them.
        tokenizer = SentencePieceTokenizer(model_path=args.tokenizer_model, special_tokens=special_tokens)
    elif args.tokenizer == "nemobert":
        tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    else:
        raise ValueError(f"received unexpected tokenizer '{args.tokenizer}'")

    if args.bert_config is not None:
        with open(args.bert_config) as json_file:
            config = json.load(json_file)
        model = nemo_nlp.nm.trainables.huggingface.BERT(**config)
    else:
        model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)

    model.restore_from(args.bert_checkpoint)
    logging.info(f"Model restored from {args.bert_checkpoint}")

hidden_size = model.hidden_size
output_file = f'{nf.work_dir}/output.txt'

if args.bert_checkpoint is None:
    """ Use this if you're using a standard BERT model.
    To see the list of pretrained models, call:
    nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models()
    """
    tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    model = nemo_nlp.nm.trainables.huggingface.BERT(
        pretrained_model_name=args.pretrained_bert_model)
else:
    """ Use this if you're using a BERT model that you pre-trained yourself. """
    if args.tokenizer == "sentencepiece":
        special_tokens = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['bert']
        tokenizer = SentencePieceTokenizer(model_path=args.tokenizer_model,
                                           special_tokens=special_tokens)
    elif args.tokenizer == "nemobert":
        tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    else:
        raise ValueError(f"received unexpected tokenizer '{args.tokenizer}'")

    if args.bert_config is not None:
        with open(args.bert_config) as json_file:
            config = json.load(json_file)
        model = nemo_nlp.nm.trainables.huggingface.BERT(**config)
    else:
        model = nemo_nlp.nm.trainables.huggingface.BERT(
            pretrained_model_name=args.pretrained_bert_model)

    model.restore_from(args.bert_checkpoint)
    logging.info(f"Model restored from {args.bert_checkpoint}")
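# The excerpts above read several attributes from an ``args`` namespace that is
# defined elsewhere in the script. The sketch below shows one way the matching
# argument parser could look; only the attribute names are taken from the code
# above, while the defaults, choices, and help strings are illustrative
# assumptions rather than the script's actual definitions.
import argparse

parser = argparse.ArgumentParser(description="BERT example script (sketch)")
parser.add_argument("--pretrained_bert_model", type=str, default="bert-base-uncased",
                    help="Name of the pretrained BERT model to load when no checkpoint is given")
parser.add_argument("--bert_checkpoint", type=str, default=None,
                    help="Path to a checkpoint of a BERT model you pre-trained yourself")
parser.add_argument("--bert_config", type=str, default=None,
                    help="Path to a JSON config describing the pre-trained BERT model")
parser.add_argument("--tokenizer", type=str, default="nemobert",
                    choices=["nemobert", "sentencepiece"],
                    help="Tokenizer to use with a self-pretrained checkpoint")
parser.add_argument("--tokenizer_model", type=str, default=None,
                    help="Path to the SentencePiece model file (used with --tokenizer sentencepiece)")
args = parser.parse_args()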