def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check intent and response features produced by training on one example.

    A single message built from ``sentence`` is given the ``intent`` and
    ``response`` attributes, a word-level CountVectorsFeaturizer is trained
    on it, and the features stored on the message are compared with the
    expected values.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is outside this view.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    example = Message(sentence)
    # both attributes are set so that the message is a valid training example
    for attribute, value in (("intent", intent), ("response", response)):
        example.set(attribute, value)

    featurizer.train(TrainingData([example]))

    assert example.get("intent_features") == intent_features
    assert example.get("response_features") == response_features
def test_count_vector_featurizer_char_intent_featurizer():
    """Check that a char-level featurizer sets no intent features.

    The featurizer is trained on the demo data set with a character
    analyzer, after which no intent example may carry ``intent_features``.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    corpus = training_data.load_data("data/examples/rasa/demo-rasa.json")
    featurizer.train(corpus, config=None)

    # one flag per intent example: does it carry intent features?
    has_intent_features = np.array(
        [ex.get("intent_features") is not None for ex in corpus.intent_examples]
    )

    # no intent features should have been set
    assert not any(has_intent_features)
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    """Check text, intent and response features with ``use_shared_vocab`` on.

    One message is built from ``sentence`` with the given ``intent`` and
    ``response``; after training, each of the three feature attributes on
    the message is compared element-wise with its expected value.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is outside this view.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    shared_vocab_config = {
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True,
    }
    featurizer = CountVectorsFeaturizer(shared_vocab_config)

    example = Message(sentence)
    # both attributes are set so that the message is a valid training example
    example.set("intent", intent)
    example.set("response", response)

    featurizer.train(TrainingData([example]))

    assert np.all(example.get("text_features") == text_features)
    assert np.all(example.get("intent_features") == intent_features)
    assert np.all(example.get("response_features") == response_features)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that features are computed from the ``tokens`` attribute.

    Both the training and the test message are created with an empty text,
    so the count vector can only come from the ``tokens`` attribute;
    relying on ``message.text`` could not produce the expected result.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is outside this view.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # the same token list is attached to the training and the test message
    token_objects = [Token(text, 0) for text in tokens]

    training_example = Message("")
    training_example.set("tokens", token_objects)
    # an intent is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    probe = Message("")
    probe.set("tokens", token_objects)
    featurizer.process(probe)

    assert np.all(probe.get("text_features") == expected)
def test_count_vector_featurizer_persist_load(tmpdir):
    """Check that a persisted featurizer is restored with identical behaviour.

    A featurizer with non-default settings is trained on two messages and
    persisted into ``tmpdir``. The loaded copy must expose the same
    vectorizer parameters (including the trained vocabulary) and must
    produce the same text features for the same sentences.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    # every option deliberately differs from the defaults
    non_default_config = {
        "analyzer": "char",
        "token_pattern": r"(?u)\b\w+\b",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    trained = CountVectorsFeaturizer(non_default_config)

    first_sentence = "ababab 123 13xc лаомтгцу sfjv oö aà"
    second_sentence = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    first_train = Message(first_sentence)
    second_train = Message(second_sentence)
    # an intent is needed for a valid training example
    first_train.set("intent", "bla")
    second_train.set("intent", "bla")

    trained.train(TrainingData([first_train, second_train]))

    # write the featurizer to disk
    persisted_files = trained.persist("ftr", tmpdir.strpath)

    # expected vectorizer parameters, extended by the trained vocabulary
    expected_params = trained.vectorizer.get_params()
    expected_params["vocabulary"] = trained.vectorizer.vocabulary_

    # read the featurizer back using its config plus the persisted file info
    load_meta = trained.component_config.copy()
    load_meta.update(persisted_files)
    restored = CountVectorsFeaturizer.load(load_meta, tmpdir.strpath)

    restored_params = restored.vectorizer.get_params()
    assert expected_params == restored_params

    first_test = Message(first_sentence)
    restored.process(first_test)
    second_test = Message(second_sentence)
    restored.process(second_test)

    # features computed at training time must match those after loading
    first_matches = first_train.get("text_features") == first_test.get(
        "text_features"
    )
    second_matches = second_train.get("text_features") == second_test.get(
        "text_features"
    )
    assert np.all([first_matches, second_matches])
def test_count_vector_featurizer(sentence, expected):
    """Check word-level text features for a sentence seen during training.

    The featurizer is trained on a single message built from ``sentence``
    and then applied to a fresh message with the same text.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is outside this view.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    training_example = Message(sentence)
    # an intent is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    probe = Message(sentence)
    featurizer.process(probe)

    assert np.all(probe.get("text_features") == expected)
def test_count_vector_featurizer_char(sentence, expected):
    """Check character n-gram text features for a sentence seen in training.

    The featurizer uses the ``char`` analyzer with n-grams of length 1 to 2;
    it is trained on one message and then applied to a fresh message with
    the same text.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is outside this view.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(sentence)
    # an intent is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    probe = Message(sentence)
    featurizer.process(probe)

    assert np.all(probe.get("text_features") == expected)
def __init__(self, component_config=None):
    """Initialise the component and create the wrapped featurizer.

    ``component_config`` is handed to the parent initialiser only.

    NOTE(review): the wrapped CountVectorsFeaturizer is built without
    arguments, so ``component_config`` is not forwarded to it - confirm
    that this is intended.
    """
    parent = super(IncrementalCVF, self)
    parent.__init__(component_config)

    wrapped_featurizer = CountVectorsFeaturizer()
    self.CVF = wrapped_featurizer
def required_packages(cls) -> List[Text]:
    """Return the packages of CountVectorsFeaturizer plus ``numpy``.

    The list returned by ``CountVectorsFeaturizer.required_packages()`` is
    extended in place and handed back.
    """
    packages = CountVectorsFeaturizer.required_packages()
    packages.append("numpy")
    return packages
class IncrementalCVF(IncrementalComponent):
    """Incremental wrapper around the non-incremental CountVectorsFeaturizer.

    Since this is a wrapper for the non-incremental CountVectorsFeaturizer
    to be used with our incremental EmbeddingIntentClassifier, we just need
    to take its provides, requires, and defaults.

    Each call to ``process`` looks at the last entry of the message's
    ``iu_list`` (a ``(word, type)`` pair) and adds the word's count vector
    to, or subtracts it from, the message's ``text_features``.
    """

    name = "IncrementalCVF"

    provides = CountVectorsFeaturizer.provides
    requires = CountVectorsFeaturizer.requires
    defaults = CountVectorsFeaturizer.defaults

    @classmethod
    def required_packages(cls) -> List[Text]:
        """Return the packages of CountVectorsFeaturizer plus ``numpy``."""
        reqs = CountVectorsFeaturizer.required_packages()
        reqs.append("numpy")
        return reqs

    def __init__(self, component_config=None):
        """Initialise the component and create the wrapped featurizer.

        NOTE(review): the wrapped CountVectorsFeaturizer is built without
        arguments, so ``component_config`` is not forwarded to it - confirm
        that this is intended.
        """
        super(IncrementalCVF, self).__init__(component_config)
        self.CVF = CountVectorsFeaturizer()

    def new_utterance(self) -> None:
        """Do nothing when a new utterance starts.

        We don't have anything to clear since our features are stored
        in the Message, which the IncrementalInterpreter clears.
        """
        return

    def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig = None,
              **kwargs: Any) -> None:
        """Train the wrapped CountVectorsFeaturizer on ``training_data``."""
        return self.CVF.train(training_data, cfg, **kwargs)

    def _add_text_features(self, message, additional_features):
        """Return the message's text features plus ``additional_features``.

        Similar to Featurizer's _combine_with_existing_text_features,
        except we are doing a vector sum instead of an array stack. This
        is because we're adding the new features of that word in particular
        rather than entire utterances side by side.
        """
        existing = message.get("text_features")
        if existing is None:
            return additional_features
        return np.add(existing, additional_features)

    def _sub_text_features(self, message, to_sub):
        """Return the message's text features minus ``to_sub``.

        Used on revoke to remove a word's features from the vector.
        Returns None when the message has no text features yet.
        """
        existing = message.get("text_features")
        if existing is None:
            # nothing to subtract from
            return None
        return np.subtract(existing, to_sub)

    def _word_vector(self, word):
        """Return the count vector of a single word from the text vectorizer."""
        vectorizer = self.CVF.vectorizers[MESSAGE_TEXT_ATTRIBUTE]
        return vectorizer.transform([word]).toarray().squeeze()

    def process(self, message: Message, **kwargs: Any) -> None:
        """Apply the last incremental unit of ``message`` to its text features.

        Assuming not using spacy_doc or tokens, so only the word carried by
        the incremental unit is featurized. An ``"add"`` unit adds the
        word's vector, a ``"revoke"`` unit subtracts it, and any other type
        is logged as an error. A missing or empty ``iu_list`` is ignored.
        """
        iu_list = message.get('iu_list')
        if not iu_list:
            # no incremental unit to apply
            return

        iu_word, iu_type = iu_list[-1]
        if iu_type == "add":
            bag = self._word_vector(iu_word)
            message.set("text_features",
                        self._add_text_features(message, bag))
        elif iu_type == "revoke":
            self._revoke(message, iu_word)
        else:
            # lazy formatting also copes with a non-string iu_type
            logger.error("incompatible iu type, expected 'add' or 'revoke',"
                         " got '%s'", iu_type)

    # TODO: can we just subtract the vector instead of
    # storing previous features?
    def _revoke(self, message, word):
        """Subtract the count vector of ``word`` from the text features.

        Revoke on a message without text features does nothing.
        """
        if message.get("text_features") is None:
            return
        bag = self._word_vector(word)
        message.set("text_features", self._sub_text_features(message, bag))

    def persist(self,
                file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Pickle this component into ``model_dir``.

        The file is named ``file_name`` + ``".pkl"``; the returned dict
        stores that name under the ``"file"`` key.
        """
        file_name = file_name + ".pkl"
        featurizer_file = os.path.join(model_dir, file_name)
        utils.json_pickle(featurizer_file, self)
        return {"file": file_name}

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['IncrementalCVF'] = None,
             **kwargs: Any) -> 'IncrementalCVF':
        """Load a persisted component, or build a fresh one from ``meta``.

        When ``model_dir`` and ``meta["file"]`` are both set, the component
        is unpickled from that file. Otherwise a warning is logged and a
        new, untrained instance is returned.
        """
        if model_dir and meta.get("file"):
            featurizer_file = os.path.join(model_dir, meta.get("file"))
            # NOTE(review): unpickling runs on whatever is in the model
            # directory - only load models from trusted sources.
            return utils.json_unpickle(featurizer_file)

        # os.path.abspath(None) raises TypeError, so only resolve real paths
        shown_path = None if model_dir is None else os.path.abspath(model_dir)
        logger.warning("Failed to load featurizer. Maybe path {} "
                       "doesn't exist".format(shown_path))
        return IncrementalCVF(meta)