def tokenize(self, text: Text) -> typing.List[Token]:
    if self.third_party_service_endpoint is not None:
        req = requests.post(
            self.third_party_service_endpoint, data={"text": text}
        )
        return [Token(v["text"], v["end"]) for v in req.json()]
    else:
        logger.warning(
            "Third party tokenizer component in pipeline, but no "
            "`third_party_service_endpoint` configuration in the config."
        )
        return [Token(text, 0)]
def tokenize(self, text: Text) -> typing.List[Token]:
    if self.third_party_service_endpoint is not None:
        headers = {
            "Content-type": "application/json",
            "Accept": "application/json",
        }
        req = requests.post(
            self.third_party_service_endpoint,
            data=json.dumps({"text": text}),
            headers=headers,
        )
        return [Token(v["text"], v["end"]) for v in req.json()]
    else:
        logger.warning(
            "Third party tokenizer component in pipeline, but no "
            "`third_party_service_endpoint` configuration in the config."
        )
        return [Token(text, 0)]
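The two snippets above assume the remote service answers with a JSON list of objects shaped like {"text": ..., "end": ...}. A minimal sketch of that contract follows; the whitespace_spans helper is purely illustrative and not part of any real service.

# Hypothetical sketch of the response payload the tokenizers above expect;
# whitespace_spans is an assumption, not an existing API.
from typing import Dict, List


def whitespace_spans(text: str) -> List[Dict[str, object]]:
    """Split on whitespace and report each token with its end offset."""
    spans = []
    offset = 0
    for word in text.split():
        start = text.index(word, offset)
        offset = start + len(word)
        spans.append({"text": word, "end": offset})
    return spans


# whitespace_spans("hello rasa") == [{"text": "hello", "end": 5},
#                                    {"text": "rasa", "end": 10}]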
def test_tokens_comparison():
    from rasa.nlu.tokenizers import Token

    x = Token("hello", 0)
    y = Token("Hello", 0)

    assert x == x
    assert y < x
    assert x != 1

    with pytest.raises(TypeError):
        assert y < "a"
def tokenize(self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
    if not self.case_sensitive:
        text = text.lower()

    if attribute != MESSAGE_INTENT_ATTRIBUTE:
        # remove 'not a word character' if
        words = re.sub(
            # there is a space or an end of a string after it
            r"[^\w#@&]+(?=\s|$)|"
            # there is a space or beginning of a string before it
            # not followed by a number
            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
            # not in between numbers and not . or @ or & or - or #
            # e.g. 10'000.00 or [email protected]
            # and not url characters
            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
            " ",
            text,
        ).split()
    else:
        words = (
            text.split(self.intent_split_symbol)
            if self.intent_tokenization_flag
            else [text]
        )

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
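The offset-tracking loop at the end is what turns a plain word list back into Token(word, start) pairs. A standalone sketch of just that step, assuming a simple whitespace split instead of the full regex above:

# Standalone sketch of the offset-tracking loop, assuming a plain
# whitespace split rather than the regex used by the tokenizer above.
text = "forget my order please"
running_offset = 0
for word in text.split():
    word_offset = text.index(word, running_offset)
    running_offset = word_offset + len(word)
    print(word, word_offset)
# forget 0
# my 7
# order 10
# please 16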
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
    import jieba

    text = self.preprocess_text(text, attribute)
    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(self, text: Text) -> List[Token]:
    import MicroTokenizer

    tokenized = MicroTokenizer.cut(text, **self.kwargs)

    tokens = []
    offset = 0
    for word in tokenized:
        tokens.append(Token(word, offset))
        offset += len(word)
    return tokens
def tokenize(text: Text) -> List[Token]:
    def mecabsplit(mecab_tagger, inputs, pos):
        # Parse the input with MeCab and split compound analyses ("+"-joined
        # morphemes) into (surface, POS) pairs.
        r = []
        inputs = mecab_tagger.parse(inputs)
        t = inputs.split("\n")[:-2]
        for i in t:
            field = i.split("\t")
            if field[1].split(",")[-1] != "*":
                r.extend(
                    (x.split("/")[0], x.split("/")[1])
                    for x in field[1].split(",")[-1].split("+")
                )
            else:
                r.append((field[0], field[1].split(",")[0]))
        if pos:
            return r
        else:
            return [x[0] for x in r]

    mecab_tagger = MeCab.Tagger()
    a = mecab_tagger.parse(text)
    t = a.split("\n")[:-2]

    # Build (start, end) character pointers for every morpheme in the text.
    tokenpointer = []
    pointeroffset = 0
    for i in t:
        field = i.split("\t")
        if field[1].split(",")[-1] != "*":
            currentptr = text.index(field[0], pointeroffset)
            for x in field[1].split(",")[-1].split("+"):
                try:
                    w = x.split("/")[0]
                    temp = field[0].index(w)
                    tokenpointer.append(
                        (currentptr + temp, currentptr + temp + len(w))
                    )
                except ValueError:
                    # morpheme not found in the surface form; fall back to
                    # the span of the whole surface token
                    tokenpointer.append((currentptr, currentptr + len(field[0])))
            pointeroffset = currentptr + len(field[0])
        else:
            currentptr = text.index(field[0], pointeroffset)
            tokenpointer.append((currentptr, currentptr + len(field[0])))
            pointeroffset = currentptr + len(field[0])

    words = mecabsplit(mecab_tagger, text, False)
    tokens = []
    offset = 0
    for word in words:
        word_offset = tokenpointer[words.index(word, offset)][0]
        tokens.append(Token(word, word_offset))
        offset += 1
    return tokens
def tokenize(self, text: Text) -> List[Token]:
    # only strip punctuation that is followed by a space or the end of the
    # string, because we do not want to turn 10.000 into 10 000
    words = re.sub(r"[.,!?]+(\s|$)", " ", text).split()

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(text: Text) -> List[Token]: mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ko-dic") parsed = mt.parse(text) x = parsed.replace("\n", "\t").split("\t") words = [] for i in range(0, len(x) - 2, 2): w = x[i] words.append(w) running_offset = 0 tokens = [] for word in words: word_offset = text.index(word, running_offset) word_len = len(word) running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) return tokens
def tokenize(text: Text) -> List[Token]:
    mt = MeCab.Tagger()
    parsed = mt.parse(text)
    x = parsed.replace("\n", "\t").split("\t")

    words = []
    for i in range(0, len(x) - 2, 2):
        w = x[i]
        words.append(w)

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(text: Text) -> List[Token]:
    # remove 'not a word character' if
    words = re.sub(
        # there is a space or an end of a string after it
        r"[^\w#@&]+(?=\s|$)|"
        # there is a space or beginning of a string before it
        # not followed by a number
        r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
        # not in between numbers and not . or @ or & or - or #
        # e.g. 10'000.00 or [email protected]
        # and not url characters
        r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
        " ",
        text,
    ).split()

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(self, doc: "Doc") -> typing.List[Token]: return [Token(t.text, t.idx) for t in doc]
def _token_from_offset(
    self, text: bytes, offset: int, encoded_sentence: bytes
) -> Token:
    return Token(
        text.decode(DEFAULT_ENCODING),
        self._byte_to_char_offset(encoded_sentence, offset),
    )
def process(self, message: "Message", **kwargs: Any) -> None:
    """Process an incoming message.

    This is the component's chance to process an incoming message.
    The component can rely on any context attribute created by a call to
    :meth:`components.Component.pipeline_init` of ANY component and on any
    context attributes created by a call to
    :meth:`components.Component.process` of components previous to this one."""
    self.sium.set_context(self.context)
    # TODO: lowercase IU
    # The latest IU is appended to "iu_list" in the message,
    # so we grab the last one out of that.
    iu_list = message.get("iu_list")
    new_iu = iu_list[-1]
    # Extract into a tuple of (word, type),
    # where type is either an "add" or a "revoke".
    iu_word, iu_type = new_iu
    # If it's an add, we have to update our intents
    # and extract any entities if they meet our threshold.
    # We also have to keep track of our word offset for
    # the entities message.
    if iu_type == "add":
        self.tokens.append(Token(iu_word, self.word_offset))
        props, prop_dist = self.sium.add_word_increment({"word": iu_word})
        for p in props:
            # add the entity if its confidence is above 0.5
            if prop_dist.prob(p) > 0.5:
                self.extracted_entities.append({
                    "start": self.word_offset,
                    "end": self.word_offset + len(iu_word) - 1,
                    "value": iu_word,
                    "entity": p,
                    "confidence": prop_dist.prob(p),
                    "extractor": "rasa_sium",
                })
        self.word_offset += len(iu_word)
    elif iu_type == "revoke":
        # Need to undo everything above: remove tokens, revoke the word,
        # remove extracted entities, and subtract the word offset.
        self.word_offset -= len(iu_word)
        # Remove our latest token from our list.
        self.tokens.pop()
        # This is a bit more difficult: if our word shows up in any
        # extracted entities, we need to remove that entity from our
        # list of entities.
        if self.extracted_entities:
            last_entity = self.extracted_entities[-1]
            if iu_word in last_entity.values():
                self.extracted_entities.pop()
        self.sium.revoke()
    else:
        logger.error(
            "incompatible iu type, expected 'add' or 'revoke', "
            "got '" + iu_type + "'"
        )
    pred_intent, intent_ranks = self.__get_intents_and_ranks()
    message.set("intent", pred_intent, add_to_output=True)
    message.set("intent_ranking", intent_ranks)
    message.set("tokens", self.tokens)
    message.set("entities", self.extracted_entities, add_to_output=True)
"name": "DucklingHTTPExtractor" }, ] }) return utilities.interpreter_for( component_builder, data="./data/examples/rasa/demo-rasa.json", path=tmpdir_factory.mktemp("projects").strpath, config=conf, ) # Chinese Example # "对面食过敏" -> To be allergic to wheat-based food CH_wrong_segmentation = [ Token("对面", 0), Token("食", 2), Token("过敏", 3), # opposite, food, allergy ] CH_correct_segmentation = [ Token("对", 0), Token("面食", 1), Token("过敏", 3), # towards, wheat-based food, allergy ] CH_wrong_entity = {"start": 0, "end": 2, "value": "对面", "entity": "direction"} CH_correct_entity = { "start": 1, "end": 3, "value": "面食", "entity": "food_type" }
def tokenize(text: Text) -> List[Token]:
    import jieba

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def _token_from_offset(self, text, offset, encoded_sentence):
    return Token(
        text.decode("utf-8"),
        self._byte_to_char_offset(encoded_sentence, offset),
    )
def tokenize(self, doc: "Doc") -> typing.List[Token]:
    return [Token(t.lemma_, t.idx) for t in doc]