def test_lm_tokenizer_edge_cases(
    model_name,
    model_weights,
    texts,
    expected_tokens,
    expected_indices,
    expected_num_token_ids,
):
    """Check ``LanguageModelTokenizer`` output on edge-case texts.

    For each text, asserts the tokenizer produces the expected token
    strings, the expected (start, end) character offsets, and that the
    language-model doc carries the expected number of token ids.

    Args (supplied via pytest parametrization — presumably
    ``@pytest.mark.parametrize``; confirm against the test module):
        model_name: transformer model identifier passed to HFTransformersNLP.
        model_weights: optional weights identifier; ``None`` means
            "use the component default" and is omitted from the config.
        texts: input texts, aligned element-wise with the three
            expectation sequences below.
        expected_tokens: per-text list of ground-truth token strings.
        expected_indices: per-text list of (start, end) offset pairs.
        expected_num_token_ids: per-text expected count of token ids.
    """
    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}

    # Build the component config. A plain dict literal with a single
    # unpack replaces the original's redundant {**{...}, **{...}} form.
    transformers_config = {"model_name": model_name, **model_weights_config}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    for text, gt_tokens, gt_indices, gt_num_indices in zip(
        texts, expected_tokens, expected_indices, expected_num_token_ids
    ):
        message = Message.build(text=text)
        transformers_nlp.process(message)

        tokens = lm_tokenizer.tokenize(message, TEXT)
        token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS]

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
        assert len(token_ids) == gt_num_indices
def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_indices):
    """Verify token texts and character offsets for edge-case inputs.

    Each text is processed by ``HFTransformersNLP`` for the given model,
    then tokenized; the resulting token strings and their (start, end)
    offsets are compared against the parametrized ground truth.
    """
    transformers_nlp = HFTransformersNLP({"model_name": model_name})
    lm_tokenizer = LanguageModelTokenizer()

    cases = zip(texts, expected_tokens, expected_indices)
    for text, ref_tokens, ref_spans in cases:
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)

        assert [tok.text for tok in tokens] == ref_tokens
        assert [tok.start for tok in tokens] == [span[0] for span in ref_spans]
        assert [tok.end for tok in tokens] == [span[1] for span in ref_spans]