def test_preserve_sentence_and_sequence_features_old_config():
    """Features computed by the old HFTransformersNLP component must survive.

    The message is featurized via the deprecated HFTransformersNLP ("bert")
    pipeline, then a LanguageModelFeaturizer configured with a different model
    ("gpt2") processes it. The features already attached to the message should
    come from the HFTransformersNLP docs, not from the featurizer's own model.
    """
    attribute = "text"
    message = Message.build("hi there")

    # Old-style pipeline: HFTransformersNLP precomputes language-model docs.
    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    transformers_nlp.process(message)

    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)

    # Featurizer is deliberately configured with a *different* model.
    lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"})
    lm_featurizer.process(message)

    # Clear cached docs so each component recomputes them from scratch below.
    message.set(LANGUAGE_MODEL_DOCS[attribute], None)
    lm_docs = lm_featurizer._get_docs_for_batch(
        [message], attribute=attribute, inference_mode=True
    )[0]
    hf_docs = transformers_nlp._get_docs_for_batch(
        [message], attribute=attribute, inference_mode=True
    )[0]

    # Message features must match the "bert" docs and differ everywhere from
    # the "gpt2" docs.
    assert not (message.features[0].features == lm_docs[SEQUENCE_FEATURES]).any()
    assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any()
    assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
    assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()
def test_lm_tokenizer_edge_cases(
    model_name,
    model_weights,
    texts,
    expected_tokens,
    expected_indices,
    expected_num_token_ids,
):
    """Tokenize edge-case texts and verify token texts, offsets and id counts.

    Each text is processed by HFTransformersNLP and then tokenized by
    LanguageModelTokenizer; results are compared against the expectations
    supplied by the parametrization.
    """
    # Only pass "model_weights" through when the test case provides it, so the
    # component otherwise falls back to its defaults.
    transformers_config = {"model_name": model_name}
    if model_weights is not None:
        transformers_config["model_weights"] = model_weights

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    cases = zip(texts, expected_tokens, expected_indices, expected_num_token_ids)
    for text, gt_tokens, gt_indices, gt_num_indices in cases:
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)
        token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS]

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [start for start, _ in gt_indices]
        assert [t.end for t in tokens] == [end for _, end in gt_indices]
        assert len(token_ids) == gt_num_indices
def process_texts(
    texts: List[Text], model_name: Text, model_weights: Text
) -> List[Message]:
    """Run each text through WhitespaceTokenizer and HFTransformersNLP.

    Args:
        texts: raw input texts to process.
        model_name: transformer model name used to build the component config.
        model_weights: weights identifier for the pretrained model.

    Returns:
        One fully processed ``Message`` per input text, in input order.
    """
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)

    processed = []
    for text in texts:
        msg = Message.build(text=text)
        whitespace_tokenizer.process(msg)
        transformer.process(msg)
        processed.append(msg)
    return processed
def test_log_longer_sequence(
    sequence_length: int, model_name: Text, should_overflow: bool, caplog
):
    """Overflowing the model's max sequence length should log the text.

    Builds a text of ``sequence_length`` repetitions of "hi"; when the case is
    expected to overflow, the debug log must mention the text, and the
    language-model doc must be produced either way.
    """
    transformers_config = {"model_name": model_name}
    transformers_nlp = HFTransformersNLP(transformers_config)

    text = " ".join(["hi"] * sequence_length)
    # Consistency fix: construct the message via Message.build(...) like every
    # other test in this module, instead of the positional Message(text) form.
    message = Message.build(text)

    caplog.set_level(logging.DEBUG)
    transformers_nlp.process(message)

    if should_overflow:
        assert "hi hi hi" in caplog.text
    # The doc is expected regardless of whether the sequence overflowed.
    assert message.get("text_language_model_doc") is not None
def test_log_deprecation_warning_with_old_config(caplog: LogCaptureFixture):
    """Processing docs produced by HFTransformersNLP must log a deprecation.

    The LanguageModelFeaturizer should notice that the message carries docs
    from the deprecated HFTransformersNLP component and emit a debug message
    saying so.
    """
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    transformers_nlp.process(message)

    caplog.set_level(logging.DEBUG)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)

    # skip_model_load: only the deprecation path is under test here, so the
    # featurizer does not need real model weights.
    lm_featurizer = LanguageModelFeaturizer(skip_model_load=True)
    caplog.clear()
    with caplog.at_level(logging.DEBUG):
        lm_featurizer.process(message)

    assert "deprecated component HFTransformersNLP" in caplog.text
def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_indices):
    """Verify LanguageModelTokenizer token texts and offsets on edge cases."""
    transformers_nlp = HFTransformersNLP({"model_name": model_name})
    lm_tokenizer = LanguageModelTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [start for start, _ in gt_indices]
        assert [t.end for t in tokens] == [end for _, end in gt_indices]
def test_hf_transformer_edge_cases(
    model_name, model_weights, texts, expected_tokens, expected_indices
):
    """Whitespace tokens must keep their texts and offsets after
    HFTransformersNLP processes the message."""
    # Only include "model_weights" in the config when a value is supplied.
    transformers_config = {"model_name": model_name}
    if model_weights is not None:
        transformers_config["model_weights"] = model_weights

    hf_transformer = HFTransformersNLP(transformers_config)
    whitespace_tokenizer = WhitespaceTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        message.set(TOKENS_NAMES[TEXT], tokens)
        hf_transformer.process(message)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [start for start, _ in gt_indices]
        assert [t.end for t in tokens] == [end for _, end in gt_indices]
from rasa.nlu.constants import (TEXT, SPACY_DOCS)

logger = logging_setup()

# One sentence run through every available tokenizer for comparison.
test_input = "Okay, pick up this yellow banana for me."
message = Message(test_input)

# Plain whitespace splitting.
tokenizer = WhitespaceTokenizer()
tokens = tokenizer.tokenize(message, attribute=TEXT)
logger.info('Whitespace: {}'.format([t.text for t in tokens]))

# SpaCy needs its parsed doc attached to the message first.
tokenizer = SpacyTokenizer()
message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
tokens = tokenizer.tokenize(message, attribute=TEXT)
logger.info('SpaCy: {}'.format([t.text for t in tokens]))

# Mitie tokenization.
tokenizer = MitieTokenizer()
tokens = tokenizer.tokenize(message, attribute=TEXT)
logger.info('Mitie: {}'.format([t.text for t in tokens]))

# ConveRT subword tokenization.
tokenizer = ConveRTTokenizer()
tokens = tokenizer.tokenize(message, attribute=TEXT)
logger.info('ConveRT: {}'.format([t.text for t in tokens]))

# BERT: HFTransformersNLP must process the message before tokenizing.
tokenizer = LanguageModelTokenizer()
transformers_nlp = HFTransformersNLP({"model_name": "bert"})
transformers_nlp.process(message)
tokens = tokenizer.tokenize(message, attribute=TEXT)
logger.info('BERT: {}'.format([t.text for t in tokens]))