def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Adds the precomputed word vectors to the messages features."""
    doc = self._get_doc(message, attribute)
    if doc is None:
        return

    # Attach the language model's per-token vectors first, then the
    # sentence-level vector, mirroring the order callers expect.
    alias = self.component_config[FEATURIZER_CLASS_ALIAS]
    for feature_type, matrix in (
        (FEATURE_TYPE_SEQUENCE, doc[SEQUENCE_FEATURES]),
        (FEATURE_TYPE_SENTENCE, doc[SENTENCE_FEATURES]),
    ):
        message.add_features(Features(matrix, feature_type, attribute, alias))
def _set_attribute_features(
    self,
    attribute: Text,
    sequence_features: List[scipy.sparse.spmatrix],
    sentence_features: List[scipy.sparse.spmatrix],
    examples: List[Message],
) -> None:
    """Set computed features of the attribute to corresponding message objects"""
    alias = self.component_config[FEATURIZER_CLASS_ALIAS]
    for i, message in enumerate(examples):
        # Attach whichever feature kinds were actually computed for this example.
        if sequence_features[i] is not None:
            message.add_features(
                Features(sequence_features[i], FEATURE_TYPE_SEQUENCE, attribute, alias)
            )
        if sentence_features[i] is not None:
            message.add_features(
                Features(sentence_features[i], FEATURE_TYPE_SENTENCE, attribute, alias)
            )
def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Adds the spacy word vectors to the messages features."""
    doc = self.get_doc(message, attribute)
    if doc is None:
        return

    if doc.vocab.vectors_length == 0:
        # in case an empty spaCy model was used, no vectors are present
        logger.debug("No features present. You are using an empty spaCy model.")
        return

    sequence_features = self._features_for_doc(doc)
    sentence_features = self._calculate_sentence_features(
        sequence_features, self.pooling_operation
    )

    alias = self.component_config[FEATURIZER_CLASS_ALIAS]
    message.add_features(
        Features(sequence_features, FEATURE_TYPE_SEQUENCE, attribute, alias)
    )
    message.add_features(
        Features(sentence_features, FEATURE_TYPE_SENTENCE, attribute, alias)
    )
def test_combine_with_existing_dense_features_shape_mismatch():
    current = Features(
        np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), FEATURE_TYPE_SEQUENCE, TEXT, "test"
    )
    incompatible = np.array([[0, 1]])

    # Combining feature matrices whose row counts differ must fail.
    with pytest.raises(ValueError):
        current.combine_with_features(incompatible)
def test_combine_with_existing_dense_features():
    current = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test")
    additional = np.array([[1, 0], [0, 1]])

    combined = current.combine_with_features(additional)

    # New columns are appended to the right of the existing ones.
    assert np.all(np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]) == combined)
def test_combine_with_existing_sparse_features_shape_mismatch():
    current = Features(
        scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test"
    )
    incompatible = scipy.sparse.csr_matrix([[0, 1]])

    # Row counts differ, so combining the two matrices must raise.
    with pytest.raises(ValueError):
        current.combine_with_features(incompatible)
def test_combine_with_existing_sparse_features():
    current = Features(
        scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test"
    )
    additional = scipy.sparse.csr_matrix([[1, 0], [0, 1]])

    combined = current.combine_with_features(additional).toarray()

    # The new sparse columns end up appended on the right.
    assert np.all(np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]) == combined)
def process(self, message: Message, **kwargs: Any) -> None:
    """Featurize an incoming message with the MITIE feature extractor."""
    extractor = self._mitie_feature_extractor(**kwargs)
    tokens = train_utils.tokens_without_cls(message)
    message.add_features(
        Features(
            self.features_for_tokens(tokens, extractor),
            TEXT,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
def process(
    self, message: Message, *, tf_hub_module: Any = None, **kwargs: Any
) -> None:
    """Compute features for a single message and attach them to it."""
    # _compute_features operates on a batch; featurize a batch of one.
    computed = self._compute_features([message], tf_hub_module)
    message.add_features(
        Features(computed[0], TEXT, self.component_config[FEATURIZER_CLASS_ALIAS])
    )
def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
    """Attach regex-pattern features for the attribute, if any patterns are known."""
    if not self.known_patterns:
        return

    features = self._features_for_patterns(message, attribute)
    if features is None:
        return

    message.add_features(
        Features(features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS])
    )
def process_training_example(
    self, example: Message, attribute: Text, mitie_feature_extractor: Any
) -> None:
    """Featurize one training example's attribute and attach the result in place.

    Args:
        example: the message to featurize; features are added to it directly
        attribute: which message attribute's tokens to featurize
        mitie_feature_extractor: the extractor passed to `features_for_tokens`
            (presumably a MITIE total_word_feature_extractor — confirm at caller)
    """
    # Helper name indicates the trailing CLS token is excluded from featurization.
    tokens = train_utils.tokens_without_cls(example, attribute)
    if tokens is not None:
        features = self.features_for_tokens(tokens, mitie_feature_extractor)
        final_features = Features(
            features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS]
        )
        example.add_features(final_features)
def _set_attribute_features(
    self, attribute: Text, attribute_features: List, training_data: TrainingData
) -> None:
    """Set computed features of the attribute to corresponding message objects"""
    alias = self.component_config[FEATURIZER_CLASS_ALIAS]
    for i, message in enumerate(training_data.training_examples):
        # Skip examples for which no features were computed.
        if attribute_features[i] is None:
            continue
        message.add_features(Features(attribute_features[i], attribute, alias))
def _set_features(
    self,
    message: Message,
    sequence_features: np.ndarray,
    sentence_features: np.ndarray,
    attribute: Text,
):
    """Attach the given sequence- and sentence-level features to the message."""
    alias = self.component_config[FEATURIZER_CLASS_ALIAS]
    for feature_type, matrix in (
        (FEATURE_TYPE_SEQUENCE, sequence_features),
        (FEATURE_TYPE_SENTENCE, sentence_features),
    ):
        message.add_features(Features(matrix, feature_type, attribute, alias))
def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
    """Attach regex-pattern features for the attribute, if any patterns are known."""
    if not self.known_patterns:
        return

    sequence_features, sentence_features = self._features_for_patterns(
        message, attribute
    )
    alias = self.component_config[FEATURIZER_CLASS_ALIAS]

    if sequence_features is not None:
        message.add_features(
            Features(sequence_features, FEATURE_TYPE_SEQUENCE, attribute, alias)
        )
    if sentence_features is not None:
        message.add_features(
            Features(sentence_features, FEATURE_TYPE_SENTENCE, attribute, alias)
        )
def _set_features(
    self,
    examples: List[Message],
    sequence_features: np.ndarray,
    sentence_features: np.ndarray,
    attribute: Text,
) -> None:
    """Attach per-example sequence and sentence features to each message."""
    alias = self.component_config[FEATURIZER_CLASS_ALIAS]
    for index, example in enumerate(examples):
        # Row `index` of each feature array belongs to `examples[index]`.
        example.add_features(
            Features(sequence_features[index], FEATURE_TYPE_SEQUENCE, attribute, alias)
        )
        example.add_features(
            Features(sentence_features[index], FEATURE_TYPE_SENTENCE, attribute, alias)
        )
def train(
    self,
    training_data: TrainingData,
    config: Optional[RasaNLUModelConfig] = None,
    *,
    tf_hub_module: Any = None,
    **kwargs: Any,
) -> None:
    """Featurize all dense-featurizable attributes of the training examples.

    Computes features in batches via `_compute_features` and attaches them to
    each example in place.

    Args:
        training_data: training data whose examples are featurized in place
        config: the model config; only used to warn when the language is not English
        tf_hub_module: module handed through to `_compute_features`
    """
    # ConveRT is English-only, so emit a warning (but do not abort) when the
    # configured training language differs.
    if config is not None and config.language != "en":
        common_utils.raise_warning(
            f"Since ``ConveRT`` model is trained only on an english "
            f"corpus of conversations, this featurizer should only be "
            f"used if your training data is in english language. "
            f"However, you are training in '{config.language}'. ",
            docs=DOCS_URL_COMPONENTS + "#convertfeaturizer",
        )

    batch_size = 64

    for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
        # Only examples that actually have this attribute are featurized.
        non_empty_examples = list(
            filter(lambda x: x.get(attribute), training_data.training_examples)
        )

        progress_bar = tqdm(
            range(0, len(non_empty_examples), batch_size),
            desc=attribute.capitalize() + " batches",
        )
        for batch_start_index in progress_bar:
            # Clamp the final batch to the number of remaining examples.
            batch_end_index = min(
                batch_start_index + batch_size, len(non_empty_examples)
            )
            # Collect batch examples
            batch_examples = non_empty_examples[batch_start_index:batch_end_index]

            batch_features = self._compute_features(
                batch_examples, tf_hub_module, attribute
            )

            # Row `index` of the batch output belongs to `batch_examples[index]`.
            for index, ex in enumerate(batch_examples):
                features = Features(
                    batch_features[index],
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
                ex.add_features(features)
def _create_sparse_features(self, message: Message) -> None:
    """Convert incoming messages into sparse features using the configured features."""
    import scipy.sparse

    # [:-1] to remove CLS token
    tokens = message.get(TOKENS_NAMES[TEXT])[:-1]

    token_features = self._tokens_to_features(tokens)
    one_hot_matrix = self._features_to_one_hot(token_features)

    message.add_features(
        Features(
            scipy.sparse.coo_matrix(one_hot_matrix),
            TEXT,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
def process(self, message: Message, **kwargs: Any) -> None:
    """Process incoming message and compute and set features"""
    if self.vectorizers is None:
        logger.error(
            "There is no trained CountVectorizer: "
            "component is either not trained or "
            "didn't receive enough training data"
        )
        return

    attribute = TEXT
    tokens = self._get_processed_message_tokens_by_attribute(message, attribute)

    # features shape (1, seq, dim)
    features = self._create_sequence(attribute, [tokens])
    if features[0] is None:
        return

    message.add_features(
        Features(features[0], attribute, self.component_config[FEATURIZER_CLASS_ALIAS])
    )
def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Adds the spacy word vectors to the messages features."""
    doc = self.get_doc(message, attribute)
    if doc is None:
        return

    if doc.vocab.vectors_length == 0:
        # in case an empty spaCy model was used, no vectors are present
        logger.debug("No features present. You are using an empty spaCy model.")
        return

    token_vectors = self._features_for_doc(doc)
    cls_vector = self._calculate_cls_vector(token_vectors, self.pooling_operation)
    combined = np.concatenate([token_vectors, cls_vector])

    message.add_features(
        Features(combined, attribute, self.component_config[FEATURIZER_CLASS_ALIAS])
    )
for i, o in enumerate(output): assert isinstance(o, np.ndarray) assert o[0][i] == 1 assert o.shape == (1, len(label_features)) @pytest.mark.parametrize( "messages, expected", [ ( [ Message( "test a", features=[ Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"), Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"), ], ), Message( "test b", features=[ Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"), Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"), ], ), ], True, ), ( [
from typing import Optional, Text, List import pytest import numpy as np import scipy.sparse from rasa.nlu.featurizers.featurizer import Features from rasa.nlu.constants import TEXT from rasa.nlu.training_data import Message @pytest.mark.parametrize( "features, attribute, featurizers, expected_features", [ (None, TEXT, [], None), ([Features(np.array([1, 1, 0]), TEXT, "test")], TEXT, [], [1, 1, 0]), ( [ Features(np.array([1, 1, 0]), TEXT, "c2"), Features(np.array([1, 2, 2]), TEXT, "c1"), Features(np.array([1, 2, 1]), TEXT, "c1"), ], TEXT, [], [1, 2, 1, 1, 2, 2, 1, 1, 0], ), ( [ Features(np.array([1, 1, 0]), TEXT, "c1"), Features(np.array([1, 2, 1]), TEXT, "test"), Features(np.array([1, 1, 1]), TEXT, "test"),
import pytest import numpy as np import scipy.sparse from rasa.nlu.featurizers.featurizer import Features from rasa.nlu.constants import TEXT, FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE from rasa.nlu.training_data import Message @pytest.mark.parametrize( "features, attribute, featurizers, expected_seq_features, expected_sen_features", [ (None, TEXT, [], None, None), ( [Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "test")], TEXT, [], [1, 1, 0], None, ), ( [ Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2"), Features(np.array([1, 2, 2]), FEATURE_TYPE_SENTENCE, TEXT, "c1"), Features(np.array([1, 2, 1]), FEATURE_TYPE_SEQUENCE, TEXT, "c1"), ], TEXT, [], [1, 2, 1, 1, 1, 0], [1, 2, 2],