def test_count_vector_featurizer_oov_words(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "OOV_token": "__oov__",
            "OOV_words": ["oov_word0", "OOV_word1"],
            "return_sequence": True,
        }
    )
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected)

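# A self-contained sketch of the OOV mechanism exercised above, hand-rolled
# here with plain sklearn (which CountVectorsFeaturizer wraps) rather than
# the featurizer's actual code: every configured OOV word is replaced by the
# OOV token before counting, so all of them land in one shared vocabulary
# column. The sentence and helper below are invented for illustration.
from sklearn.feature_extraction.text import CountVectorizer

_oov_token, _oov_words = "__oov__", ["oov_word0", "oov_word1"]

def _replace_oov(text):
    return " ".join(_oov_token if w in _oov_words else w for w in text.split())

_cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
_counts = _cv.fit_transform([_replace_oov("hello oov_word0 oov_word1")])
assert _counts.toarray()[0][_cv.vocabulary_[_oov_token]] == 2
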
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception(
            "Failed to process message. "
            "Missing a proper MITIE feature extractor."
        )

    if self.clf:
        token_strs = self._tokens_of_message(message)
        intent, confidence = self.clf(token_strs, mitie_feature_extractor)
    else:
        # either the model didn't get trained or it wasn't
        # provided with any data
        intent = None
        confidence = 0.0

    message.set(
        "intent", {"name": intent, "confidence": confidence}, add_to_output=True
    )

def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}
    message = Message(sentence, greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None

def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

def set_fasttext_features(self, message: Message, attribute: Text = TEXT) -> None:
    tokens = message.get(TOKENS_NAMES[attribute])
    if not tokens:
        return None

    text_vector = self.model.get_word_vector(message.text)
    word_vectors = [
        self.model.get_word_vector(t.text)
        for t in train_utils.tokens_without_cls(message, attribute)
    ]
    # remember, we need one extra vector for __CLS__
    X = np.array(word_vectors + [text_vector])

    features = self._combine_with_existing_dense_features(
        message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)

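# A minimal, self-contained sketch of the shape contract implemented above,
# with a stub standing in for a real fastText model (assumption: its
# `get_word_vector` returns a fixed-size 1-D array). One row per token plus
# a final row for the __CLS__ pseudo-token gives len(tokens) + 1 rows.
import numpy as np

class _StubFastTextModel:
    dim = 4

    def get_word_vector(self, text):
        # deterministic fake embedding, keyed on the text
        rng = np.random.default_rng(sum(ord(c) for c in text))
        return rng.standard_normal(self.dim)

_model = _StubFastTextModel()
_token_texts = ["hello", "world"]
_X = np.array(
    [_model.get_word_vector(t) for t in _token_texts]
    + [_model.get_word_vector("hello world")]
)
assert _X.shape == (len(_token_texts) + 1, _StubFastTextModel.dim)
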
def test_count_vector_featurizer_char(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer(
        {"min_ngram": 1, "max_ngram": 2, "analyzer": "char", "return_sequence": True}
    )
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected)

def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
    )
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", intent)
    train_message.set("response", response)
    data = TrainingData([train_message])
    ftr.train(data)

    assert np.all(train_message.get("text_features") == text_features)
    assert np.all(train_message.get("intent_features") == intent_features)
    assert np.all(train_message.get("response_features") == response_features)

def process(self, message: Message, **kwargs: Any):
    spans = message.get("spans", [])
    pronouns = [span for span in spans if span["label"] == "Pronoun"]

    coreferences = []
    for pronoun in pronouns:
        ent = self.stag(pronoun, message)
        if ent:
            coreferences.append(
                {
                    "pronoun": {"start": pronoun["start"], "end": pronoun["end"]},
                    "entity": {"start": ent["start"], "end": ent["end"]},
                }
            )

    span_output_format(spans)
    message.set("coreferences", coreferences, add_to_output=True)
    logging.info("coref data: {}".format(message.data))

def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(sentence)
    test_message = Message(sentence)

    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    actual = test_message.get_sparse_features(TEXT, [])

    assert isinstance(actual, scipy.sparse.coo_matrix)
    assert np.all(actual.toarray() == expected)

def test_crf_use_dense_features(spacy_nlp):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features = message.data.get("text_dense_features")[0]
    for i in range(len(dense_features)):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[i]
        )

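# Hedged sketch of the feature layout the assertions above walk through:
# per token, CRF features are keyed "<relative position>:<feature name>",
# and a dense vector is unrolled into an inner {str(index): value} dict so
# python-crfsuite can consume numeric features. The values are invented.
_example_token_features = {
    "0:low": "rasa",
    "0:text_dense_features": {
        "text_dense_features": {"0": 0.12, "1": -0.3},
    },
}
assert (
    _example_token_features["0:text_dense_features"]["text_dense_features"]["1"]
    == -0.3
)
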
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not produce the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)

def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

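# A hand-rolled sketch of what the per-token pattern features encode, not
# the featurizer's actual implementation: one column per pattern, set to 1
# for every token whose span overlaps a regex match. The text and token
# offsets below are assumptions chosen for illustration.
import re
import numpy as np

_patterns = [r"[0-9]+", r"\bhey*"]
_text = "hey 123"
_token_offsets = [(0, 3), (4, 7)]  # (start, end) of "hey" and "123"

_feats = np.zeros((len(_token_offsets), len(_patterns)))
for _j, _pattern in enumerate(_patterns):
    for _match in re.finditer(_pattern, _text):
        for _i, (_start, _end) in enumerate(_token_offsets):
            # a token is marked when it overlaps the match span
            if _start < _match.end() and _end > _match.start():
                _feats[_i, _j] = 1.0
assert _feats.tolist() == [[0.0, 1.0], [1.0, 0.0]]
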
def test_elmo_featurizer_train():
    featurizer = ElmoFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
    message.set(TOKENS_NAMES[RESPONSE_ATTRIBUTE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
    assert vecs is None

def process(self, message: Message, **kwargs: Any) -> None:
    if self._url() is not None:
        reference_time = self._reference_time_from_message(message)
        matches = self._duckling_parse(message.text, reference_time)
        all_extracted = convert_duckling_format_to_rasa(matches)
        dimensions = self.component_config["dimensions"]
        extracted = DucklingHTTPExtractor.filter_irrelevant_entities(
            all_extracted, dimensions
        )
    else:
        extracted = []
        warnings.warn(
            "Duckling HTTP component in pipeline, but no `url` configuration "
            "in the config file nor is `RASA_DUCKLING_HTTP_URL` set as an "
            "environment variable."
        )

    extracted = self.add_extractor_name(extracted)
    message.set(
        ENTITIES_ATTRIBUTE,
        message.get(ENTITIES_ATTRIBUTE, []) + extracted,
        add_to_output=True,
    )

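# Illustrative shape of one Duckling HTTP match before it is converted to
# Rasa's entity format (field names follow the public Duckling HTTP API;
# the concrete values are invented for illustration).
# `filter_irrelevant_entities` then keeps only matches whose "dim" appears
# in the component's "dimensions" configuration.
_example_duckling_match = {
    "body": "tomorrow at 8pm",
    "start": 0,
    "end": 15,
    "dim": "time",
    "value": {"value": "2024-01-02T20:00:00.000-08:00", "grain": "hour"},
}
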
def test_mitie_featurizer_train(mitie_feature_extractor):
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    MitieTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    seq_vec, sen_vec = message.get_dense_features(TEXT, [])
    assert len(message.get(TOKENS_NAMES[TEXT])) == len(seq_vec)
    assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    seq_vec, sen_vec = message.get_dense_features(RESPONSE, [])
    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(seq_vec)
    assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    seq_vec, sen_vec = message.get_dense_features(INTENT, [])
    assert seq_vec is None
    assert sen_vec is None

def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def process(self, message: Message, **kwargs: Any) -> None:
    if self.third_party_service_endpoint is not None:
        headers = {"Content-type": "application/json", "Accept": "application/json"}
        req = requests.post(
            self.third_party_service_endpoint,
            data=json.dumps({"text": message.text}),
            headers=headers,
        )
        extracted = [
            self.transform_to_extracted(v)
            for v in req.json()
            if v["domainType"] != ""
        ]
    else:
        logger.warning(
            "Third party tokenizer component in pipeline, but no "
            "`third_party_service_endpoint` configuration in the config."
        )
        extracted = []

    extracted = self.add_extractor_name(extracted)
    message.set(
        "entities", message.get("entities", []) + extracted, add_to_output=True
    )

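# Illustrative payload assumed by the branch above: the endpoint returns a
# JSON list of candidate entities, and anything with an empty "domainType"
# is discarded. Every field name other than "domainType" is an assumption.
_example_service_response = [
    {"text": "Berlin", "start": 21, "end": 27, "domainType": "city"},
    {"text": "foo", "start": 0, "end": 3, "domainType": ""},  # filtered out
]
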
def process(self, message: Message, **kwargs: Any) -> None: """Process incoming message and compute and set features""" if self.vectorizers is None: logger.error("There is no trained CountVectorizer: " "component is either not trained or " "didn't receive enough training data") else: message_text = self._get_message_text_by_attribute( message, attribute=MESSAGE_TEXT_ATTRIBUTE) bag = (self.vectorizers[MESSAGE_TEXT_ATTRIBUTE].transform( [message_text]).toarray().squeeze()) message.set( MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], self._combine_with_existing_features( message, bag, feature_name=MESSAGE_VECTOR_FEATURE_NAMES[ MESSAGE_TEXT_ATTRIBUTE], ), )
def test_convert_featurizer_process(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def process(self, message: Message, **kwargs: Any) -> None:
    from seq2label.server.paddle_inference import Inference

    real_result_dir = os.path.join(self.model_dir, self.result_dir)

    # cache the inference function so the model is loaded only once
    if not self.predict_fn:
        self.predict_fn = Inference(real_result_dir)

    input_text = message.text
    best_result, candidate_ranking = self.predict_fn.infer(input_text)

    intent = {"name": best_result, "confidence": candidate_ranking[0][1]}
    intent_ranking = [
        {"name": name, "confidence": score} for name, score in candidate_ranking
    ]

    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)

def test_count_vector_featurizer_no_sequence(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "return_sequence": False}
    )
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert isinstance(
        test_message.get("text_sparse_features"), scipy.sparse.coo_matrix
    )
    actual = test_message.get("text_sparse_features").toarray()
    assert np.all(actual == expected)

def process(self, message: Message, **kwargs: Any) -> None:
    iu_list = message.get("iu_list")
    last_iu = iu_list[-1]
    iu_word, iu_type = last_iu

    if iu_type == "add":
        bag = (
            self.CVF.vectorizers[MESSAGE_TEXT_ATTRIBUTE]
            .transform([iu_word])
            .toarray()
            .squeeze()
        )
        return message.set("text_features", self._add_text_features(message, bag))
    elif iu_type == "revoke":
        return self._revoke(message, iu_word)
    else:
        logger.error(
            "incompatible iu type, expected 'add' or 'revoke', got '%s'", iu_type
        )

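# Illustrative incremental-unit (IU) stream assumed by the method above:
# each entry pairs a word with an edit type, where "add" extends the running
# bag-of-words features and "revoke" retracts a previously added word. The
# concrete list below is invented for illustration.
_example_iu_list = [("hello", "add"), ("world", "add"), ("world", "revoke")]
_word, _type = _example_iu_list[-1]
assert _type in ("add", "revoke")
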
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    # testing with one model should be enough
    transformers_config = {"model_name": "bert"}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens

def process(self, message: Message, **kwargs: Any):
    self.request = self.http_session.open()
    # extracted = self.add_extractor_name(self.extract_entities(text, nlp))
    chains = self.extract_coref(message.text)
    self.request.close()

    if not chains:
        message.set("sentence", message.text)
    else:
        target_words = []
        flatten_chains = []
        for chain in chains:
            # determine the entity word for this chain
            words = [s["mention"] for s in chain]
            auxiliary_words = words[0]  # candidate entity word
            for i in range(1, len(words)):
                # jposseg.cut yields pairs like [pair('该', 'r'), pair('员工', 'n')]
                flags = [s.flag for s in list(jposseg.cut(words[i]))]
                if flags[0] != "r" and len(words[i]) > len(auxiliary_words):
                    auxiliary_words = words[i]
            target_words.append(auxiliary_words)
            # attach the replacement word to every mention, then sort
            for s in chain:
                s.update({"replace": auxiliary_words})
            flatten_chains.extend(chain)

        # apply replacements from right to left so offsets stay valid
        flatten_chains.sort(key=lambda x: x["start"], reverse=True)
        temp_sentence = list(message.text)
        for d in flatten_chains:
            temp_sentence[d["start"]:d["end"]] = d["replace"]
        sentence = "".join(temp_sentence)
        message.set("sentence", sentence)

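# Self-contained sketch of the splice step above: replacements are applied
# right to left so that earlier character offsets remain valid while later
# parts of the string are rewritten (the example text is invented).
_text = list("he likes it")
for _start, _end, _repl in sorted([(0, 2, "Tom"), (9, 11, "tea")], reverse=True):
    _text[_start:_end] = _repl
assert "".join(_text) == "Tom likes tea"
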
def process(self, message: Message, **kwargs: Any): """Process an incoming message. This is the components chance to process an incoming message. The component can rely on any context attribute to be present, that gets created by a call to :meth:`components.Component.pipeline_init` of ANY component and on any context attributes created by a call to :meth:`components.Component.process` of components previous to this one.""" # TODO 分词, 如果利用其它分词组件, 需要进一步调整 if not message.get("tokens", default=None): self.extract_tokens(message) # 词性标注 self.extract_poses(message) # 句法依存 self.extract_parses(message) # 抽取实体<序列标注+实体提取> self.extract_entities(message) # 抽取代词 self.extract_pronouns(message) else: # rasa tokenizers tokens = message.get("tokens") message.set("tokenizers", tokens) # List tokens tokens = [tokenizer_extract(token) for token in tokens] message.set("tokens", tokens) self.extract_poses(message) # 句法依存 self.extract_parses(message) # 抽取实体<序列标注+实体提取> # 语义分割 -> self.entity_segment(message) # 属性分析 -> self.link_analyze(message)
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    ftr = CountVectorsFeaturizer({"use_shared_vocab": True})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None

def process(self, message: Message, **kwargs: Any) -> None: """Process incoming message and compute and set features""" if self.vectorizers is None: logger.error("There is no trained CountVectorizer: " "component is either not trained or " "didn't receive enough training data") return attribute = TEXT_ATTRIBUTE message_tokens = self._get_processed_message_tokens_by_attribute( message, attribute) # features shape (1, seq, dim) features = self._create_sequence(attribute, [message_tokens]) message.set( SPARSE_FEATURE_NAMES[attribute], self._combine_with_existing_sparse_features( message, features[0], # 0 -> batch dimension feature_name=SPARSE_FEATURE_NAMES[attribute], ), )
def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Adds the spacy word vectors to the messages features."""
    doc = self.get_doc(message, attribute)

    if doc is None:
        return

    # in case an empty spaCy model was used, no vectors are present
    if doc.vocab.vectors_length == 0:
        logger.debug("No features present. You are using an empty spaCy model.")
        return

    features = self._features_for_doc(doc)
    cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation)
    features = np.concatenate([features, cls_token_vec])

    features = self._combine_with_existing_dense_features(
        message, features, DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)

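# A hedged sketch of the CLS-vector step above, assuming "mean" pooling
# (the pooling operation is configurable): the sentence-level vector that is
# concatenated after the token vectors is simply a pooled summary of them.
import numpy as np

def _mean_pool(token_features):
    # (num_tokens, dim) -> (1, dim), so it concatenates below the tokens
    return np.mean(token_features, axis=0, keepdims=True)

_token_feats = np.array([[1.0, 2.0], [3.0, 4.0]])
assert np.allclose(_mean_pool(_token_feats), [[2.0, 3.0]])
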
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not produce the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected
    )