def test_whitespace():
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()
    assert tk.tokenize(u"Hi. My name is rasa") == \
        [u'Hi.', u'My', u'name', u'is', u'rasa']
    assert tk.tokenize(u"hello ńöñàśçií") == [u'hello', u'ńöñàśçií']
def __init__(self, resource_name, backend, language_name):
    self.intent_examples = []
    self.entity_examples = []
    self.resource_name = resource_name
    self.files = util.recursively_find_files(resource_name)
    self.fformat = self.guess_format(self.files)
    self.tokenizer = None
    self.language_name = language_name

    if backend in ['mitie', 'mitie_sklearn']:
        from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
        self.tokenizer = MITIETokenizer()
    elif backend in ['spacy_sklearn']:
        from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
        self.tokenizer = SpacyTokenizer(language_name)
    else:
        from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
        self.tokenizer = WhitespaceTokenizer()
        warnings.warn(
            "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")

    if self.fformat == 'luis':
        self.load_luis_data(self.files[0])
    elif self.fformat == 'wit':
        self.load_wit_data(self.files[0])
    elif self.fformat == 'api':
        self.load_api_data(self.files)
    elif self.fformat == 'rasa_nlu':
        self.load_data(self.files[0])
    else:
        raise ValueError("unknown training file format : {0}".format(self.fformat))
def test_whitespace():
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()
    assert [t.text for t in tk.tokenize("Forecast for lunch")] == \
        ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13]
    assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
        ['hey', 'ńöñàśçií', 'how\'re', 'you?']
    assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
        [0, 4, 13, 20]
def init_tokenizer(self, backend, nlp):
    if backend in [mitie.MITIE_BACKEND_NAME, mitie.MITIE_SKLEARN_BACKEND_NAME]:
        from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
        self.tokenizer = MITIETokenizer()
    elif backend in [spacy.SPACY_BACKEND_NAME]:
        from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
        self.tokenizer = SpacyTokenizer(nlp)
    else:
        from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
        self.tokenizer = WhitespaceTokenizer()
        warnings.warn(
            "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")
def test_multiword_entities():
    # imports assumed from the original test module; paths are a best guess
    import tempfile
    from rasa_nlu.training_data import load_data
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa_nlu.extractors.mitie_entity_extractor import MitieEntityExtractor

    data = u"""
    {
      "rasa_nlu_data": {
        "common_examples": [
          {
            "text": "show me flights to New York City",
            "intent": "unk",
            "entities": [
              {
                "entity": "destination",
                "start": 19,
                "end": 32,
                "value": "New York City"
              }
            ]
          }
        ]
      }
    }"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = load_data(f.name)
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")
        assert len(entities) == 1
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(entities[0], example.text, tokens)
        assert start == 4
        assert end == 7
def test_repeated_entities():
    # imports assumed from the original test module; paths are a best guess
    import tempfile
    from rasa_nlu.training_data import load_data
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa_nlu.extractors.mitie_entity_extractor import MitieEntityExtractor

    data = u"""
    {
      "rasa_nlu_data": {
        "common_examples": [
          {
            "text": "book a table today from 3 to 6 for 3 people",
            "intent": "unk",
            "entities": [
              {
                "entity": "description",
                "start": 35,
                "end": 36,
                "value": "3"
              }
            ]
          }
        ]
      }
    }"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = load_data(f.name)
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")
        assert len(entities) == 1
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(entities[0], example.text, tokens)
        assert start == 9
        assert end == 10
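# Hedged sketch, not the project's implementation: the two tests above exercise a
# char-span -> token-span mapping along these lines. `char_span_to_token_span` is a
# hypothetical helper; it assumes tokens expose `.text` and `.offset`, as the
# whitespace tokenizer tests elsewhere in this section show.
def char_span_to_token_span(entity, tokens):
    # first token whose character offset matches the entity start
    start = next(i for i, t in enumerate(tokens)
                 if t.offset == entity["start"])
    # token whose end character matches the entity end, exclusive upper bound
    end = next(i for i, t in enumerate(tokens)
               if t.offset + len(t.text) == entity["end"]) + 1
    return start, end

# For "show me flights to New York City" with the entity at chars 19-32 this yields
# (4, 7); for the repeated "3" at chars 35-36 it yields (9, 10), matching the
# assertions above.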
def tokenizer_from_name(name, language):
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    if name == MitieTokenizer.name:
        return MitieTokenizer()
    elif name == SpacyTokenizer.name:
        import spacy
        nlp = spacy.load(language, parser=False, entity=False)
        return SpacyTokenizer(nlp)
    elif name == WhitespaceTokenizer.name:
        return WhitespaceTokenizer()
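# Hedged usage sketch (not from the original source): resolving a tokenizer from a
# configured name. The literal name strings are not shown above, so the class
# attribute is referenced directly; "en" is an assumed language code.
from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

tk = tokenizer_from_name(WhitespaceTokenizer.name, "en")
assert [t.text for t in tk.tokenize("Forecast for lunch")] == ['Forecast', 'for', 'lunch']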
def test_whitespace():
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == \
        ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == \
        [0, 9, 13]

    # we ignore .,!?
    assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
        ['hey', 'ńöñàśçií', 'how\'re', 'you']
    assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
        [0, 4, 13, 20]

    assert [t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] == \
        ['привет', '10.000', 'ńöñàśçií', 'how\'re', 'you']
    assert [t.offset for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] == \
        [0, 8, 16, 26, 33]
def test_whitespace():
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()
    sentence = u"Hi. My name is rasa"
    assert tk.tokenize(sentence) == [u'Hi.', u'My', u'name', u'is', u'rasa']
class TrainingData(object):
    def __init__(self, resource_name, backend, language_name):
        self.intent_examples = []
        self.entity_examples = []
        self.resource_name = resource_name
        self.files = util.recursively_find_files(resource_name)
        self.fformat = self.guess_format(self.files)
        self.tokenizer = None
        self.language_name = language_name
        self.min_examples_per_intent = 2
        self.min_examples_per_entity = 2

        if backend in ['mitie', 'mitie_sklearn']:
            from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
            self.tokenizer = MITIETokenizer()
        elif backend in ['spacy_sklearn']:
            from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
            self.tokenizer = SpacyTokenizer(language_name)
        else:
            from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
            self.tokenizer = WhitespaceTokenizer()
            warnings.warn(
                "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")

        if self.fformat == 'luis':
            self.load_luis_data(self.files[0])
        elif self.fformat == 'wit':
            self.load_wit_data(self.files[0])
        elif self.fformat == 'api':
            self.load_api_data(self.files)
        elif self.fformat == 'rasa_nlu':
            self.load_data(self.files[0])
        else:
            raise ValueError("unknown training file format : {0}".format(self.fformat))

        self.validate()

    def as_json(self, **kwargs):
        return json.dumps({
            "rasa_nlu_data": {
                "intent_examples": self.intent_examples,
                "entity_examples": self.entity_examples
            }
        }, **kwargs)

    def guess_format(self, files):
        for filename in files:
            filedata = json.loads(codecs.open(filename, encoding='utf-8').read())
            # check for format-specific top-level keys in the file
            if "data" in filedata and type(filedata.get("data")) is list:
                return 'wit'
            elif "luis_schema_version" in filedata:
                return 'luis'
            elif "userSays" in filedata:
                return 'api'
            elif "rasa_nlu_data" in filedata:
                return 'rasa_nlu'
        return 'unk'

    def load_wit_data(self, filename):
        data = json.loads(codecs.open(filename, encoding='utf-8').read())
        for s in data["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0] if intents else 'None'
            entities = [e for e in entities if ("start" in e and "end" in e)]
            for e in entities:
                e["value"] = e["value"][1:-1]
            self.intent_examples.append({"text": text, "intent": intent})
            self.entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })

    def load_luis_data(self, filename):
        warnings.warn(
            """LUIS data may not always be correctly imported because entity locations are specified by tokens.
            If you use a tokenizer which behaves differently from LUIS's your entities might not be correct""")
        data = json.loads(codecs.open(filename, encoding='utf-8').read())
        for s in data["utterances"]:
            text = s.get("text")
            tokens = [t for t in self.tokenizer.tokenize(text)]
            intent = s.get("intent")
            entities = []
            for e in s.get("entities") or []:
                i, ii = e["startPos"], e["endPos"] + 1
                # map the LUIS token span back onto character offsets in the raw text
                _regex = u"\\s*".join([re.escape(token) for token in tokens[i:ii]])
                expr = re.compile(_regex)
                m = expr.search(text)
                start, end = m.start(), m.end()
                val = text[start:end]
                entities.append({
                    "entity": e["entity"],
                    "value": val,
                    "start": start,
                    "end": end
                })
            self.intent_examples.append({"text": text, "intent": intent})
            self.entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })

    def load_api_data(self, files):
        for filename in files:
            data = json.loads(codecs.open(filename, encoding='utf-8').read())
            # get only intents, skip the rest. The property name is the target class
            if "userSays" not in data:
                continue
            intent = data.get("name")
            for s in data["userSays"]:
                text = "".join([chunk["text"] for chunk in s.get("data")])
                # add entities to each token, if available
                entities = []
                for e in [chunk for chunk in s.get("data")
                          if "alias" in chunk or "meta" in chunk]:
                    start = text.find(e["text"])
                    end = start + len(e["text"])
                    val = text[start:end]
                    entities.append({
                        "entity": e["alias"] if "alias" in e else e["meta"],
                        "value": val,
                        "start": start,
                        "end": end
                    })
                self.intent_examples.append({"text": text, "intent": intent})
                self.entity_examples.append({
                    "text": text,
                    "intent": intent,
                    "entities": entities
                })

    def load_data(self, filename):
        data = json.loads(open(filename, 'rb').read())
        common = data['rasa_nlu_data'].get("common_examples", list())
        intent = data['rasa_nlu_data'].get("intent_examples", list())
        entity = data['rasa_nlu_data'].get("entity_examples", list())
        self.intent_examples = intent + common
        self.entity_examples = entity + common

    def validate(self):
        examples = sorted(self.intent_examples, key=lambda e: e["intent"])
        for intent, group in groupby(examples, lambda e: e["intent"]):
            size = len(list(group))
            if size < self.min_examples_per_intent:
                template = "intent {0} has only {1} training examples! minimum is {2}, training may fail."
                warnings.warn(template.format(intent, size, self.min_examples_per_intent))

        examples = sorted(
            [e for ex in self.entity_examples for e in ex["entities"]],
            key=lambda e: e["entity"])
        for entity, group in groupby(examples, lambda e: e["entity"]):
            size = len(list(group))
            if size < self.min_examples_per_entity:
                template = "entity {0} has only {1} training examples! minimum is {2}, training may fail."
                warnings.warn(template.format(entity, size, self.min_examples_per_entity))
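# Hedged illustration assembled from the loaders above (intent label and entity
# values are taken from the multiword-entity test earlier in this section): the
# minimal 'rasa_nlu' file layout that guess_format() recognises and load_data()
# consumes.
EXAMPLE_RASA_NLU_FILE = {
    "rasa_nlu_data": {
        "common_examples": [
            {
                "text": "show me flights to New York City",
                "intent": "unk",
                "entities": [
                    {"entity": "destination", "start": 19, "end": 32,
                     "value": "New York City"}
                ]
            }
        ],
        "intent_examples": [],
        "entity_examples": []
    }
}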
def test_whitespace():
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()
    assert tk.tokenize("Hi. My name is rasa") == \
        ['Hi.', 'My', 'name', 'is', 'rasa']
    assert tk.tokenize("hello ńöñàśçií") == ['hello', 'ńöñàśçií']
class TrainingData(object):
    # Validation will ensure and warn if these lower limits are not met
    MIN_EXAMPLES_PER_INTENT = 2
    MIN_EXAMPLES_PER_ENTITY = 2

    def __init__(self, resource_name, backend, nlp=None, file_format=None):
        self.intent_examples = []
        self.entity_examples = []
        self.entity_synonyms = {}
        self.resource_name = resource_name
        self.files = TrainingData.resolve_data_files(resource_name)
        self.fformat = file_format if file_format is not None else TrainingData.guess_format(self.files)
        self.tokenizer = None

        self.init_tokenizer(backend, nlp)
        self.load_data()
        self.validate()

    def init_tokenizer(self, backend, nlp):
        if backend in [mitie.MITIE_BACKEND_NAME, mitie.MITIE_SKLEARN_BACKEND_NAME]:
            from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
            self.tokenizer = MITIETokenizer()
        elif backend in [spacy.SPACY_BACKEND_NAME]:
            from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
            self.tokenizer = SpacyTokenizer(nlp)
        else:
            from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
            self.tokenizer = WhitespaceTokenizer()
            warnings.warn(
                "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")

    @property
    def num_entity_examples(self):
        return len([e for e in self.entity_examples if len(e["entities"]) > 0])

    @staticmethod
    def resolve_data_files(resource_name):
        try:
            return util.recursively_find_files(resource_name)
        except ValueError as e:
            raise ValueError("Invalid training data file / folder specified. " + str(e))

    def as_json(self, **kwargs):
        return json.dumps({
            "rasa_nlu_data": {
                "intent_examples": self.intent_examples,
                "entity_examples": self.entity_examples
            }
        }, **kwargs)

    @staticmethod
    def guess_format(files):
        for filename in files:
            with codecs.open(filename, encoding="utf-8-sig") as f:
                file_data = json.loads(f.read())
            if "data" in file_data and type(file_data.get("data")) is list:
                return WIT_FILE_FORMAT
            elif "luis_schema_version" in file_data:
                return LUIS_FILE_FORMAT
            elif "userSays" in file_data:
                return API_FILE_FORMAT
            elif "rasa_nlu_data" in file_data:
                return RASA_FILE_FORMAT
        return UNK_FILE_FORMAT

    def load_data(self):
        if self.fformat == LUIS_FILE_FORMAT:
            self.load_luis_data(self.files[0])
        elif self.fformat == WIT_FILE_FORMAT:
            self.load_wit_data(self.files[0])
        elif self.fformat == API_FILE_FORMAT:
            self.load_api_data(self.files)
        elif self.fformat == RASA_FILE_FORMAT:
            self.load_rasa_data(self.files[0])
        else:
            raise ValueError("unknown training file format : {0}".format(self.fformat))

    def load_wit_data(self, filename):
        with codecs.open(filename, encoding="utf-8-sig") as f:
            data = json.loads(f.read())
        for s in data["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0] if intents else 'None'
            entities = [e for e in entities if ("start" in e and "end" in e)]
            for e in entities:
                e["value"] = e["value"][1:-1]
                # create synonyms dictionary
                text_value = text[e["start"]:e["end"]]
                util.add_entities_if_synonyms(self.entity_synonyms, text_value, e["value"])
            self.intent_examples.append({"text": text, "intent": intent})
            self.entity_examples.append({"text": text, "intent": intent, "entities": entities})

    def load_luis_data(self, filename):
        warnings.warn(
            """LUIS data may not always be correctly imported because entity locations are specified by tokens.
            If you use a tokenizer which behaves differently from LUIS's your entities might not be correct""")
        with codecs.open(filename, encoding="utf-8-sig") as f:
            data = json.loads(f.read())
        for s in data["utterances"]:
            text = s.get("text")
            tokens = [t for t in self.tokenizer.tokenize(text)]
            intent = s.get("intent")
            entities = []
            for e in s.get("entities") or []:
                i, ii = e["startPos"], e["endPos"] + 1
                # map the LUIS token span back onto character offsets in the raw text
                _regex = u"\\s*".join([re.escape(token) for token in tokens[i:ii]])
                expr = re.compile(_regex)
                m = expr.search(text)
                start, end = m.start(), m.end()
                val = text[start:end]
                entities.append({"entity": e["entity"], "value": val, "start": start, "end": end})
            self.intent_examples.append({"text": text, "intent": intent})
            self.entity_examples.append({"text": text, "intent": intent, "entities": entities})

    def load_api_data(self, files):
        for filename in files:
            with codecs.open(filename, encoding="utf-8-sig") as f:
                data = json.loads(f.read())

            # get only intents, skip the rest. The property name is the target class
            if "userSays" in data:
                intent = data.get("name")
                for s in data["userSays"]:
                    text = "".join(map(lambda chunk: chunk["text"], s.get("data")))
                    # add entities to each token, if available
                    entities = []
                    for e in filter(lambda chunk: "alias" in chunk or "meta" in chunk, s.get("data")):
                        start = text.find(e["text"])
                        end = start + len(e["text"])
                        val = text[start:end]
                        entities.append({
                            "entity": e["alias"] if "alias" in e else e["meta"],
                            "value": val,
                            "start": start,
                            "end": end
                        })
                    self.intent_examples.append({"text": text, "intent": intent})
                    self.entity_examples.append({"text": text, "intent": intent, "entities": entities})

            # create synonyms dictionary
            if "name" in data and "entries" in data:
                for entry in data["entries"]:
                    if "value" in entry and "synonyms" in entry:
                        for synonym in entry["synonyms"]:
                            util.add_entities_if_synonyms(self.entity_synonyms, synonym, entry["value"])

    def load_rasa_data(self, filename):
        with codecs.open(filename, encoding="utf-8-sig") as f:
            data = json.loads(f.read())
        common = data['rasa_nlu_data'].get("common_examples", list())
        intent = data['rasa_nlu_data'].get("intent_examples", list())
        entity = data['rasa_nlu_data'].get("entity_examples", list())
        self.intent_examples = intent + common
        self.entity_examples = entity + common

        for example in self.entity_examples:
            for entity in example["entities"]:
                entity_val = example["text"][entity["start"]:entity["end"]]
                util.add_entities_if_synonyms(self.entity_synonyms, entity_val, entity.get("value"))

    def sorted_entity_examples(self):
        return sorted([entity for ex in self.entity_examples for entity in ex["entities"]],
                      key=lambda e: e["entity"])

    def sorted_intent_examples(self):
        return sorted(self.intent_examples, key=lambda e: e["intent"])

    def validate(self):
        examples = self.sorted_intent_examples()
        for intent, group in groupby(examples, lambda e: e["intent"]):
            size = len(list(group))
            if size < self.MIN_EXAMPLES_PER_INTENT:
                template = u"Intent '{0}' has only {1} training examples! minimum is {2}, training may fail."
                warnings.warn(template.format(intent, size, self.MIN_EXAMPLES_PER_INTENT))

        sorted_entity_examples = self.sorted_entity_examples()
        for entity, group in groupby(sorted_entity_examples, lambda e: e["entity"]):
            size = len(list(group))
            if size < self.MIN_EXAMPLES_PER_ENTITY:
                template = u"Entity '{0}' has only {1} training examples! minimum is {2}, training may fail."
                warnings.warn(template.format(entity, size, self.MIN_EXAMPLES_PER_ENTITY))

        for example in self.entity_examples:
            text = example["text"]
            text_tokens = self.tokenizer.tokenize(text)
            for ent in example["entities"]:
                ent_tokens = self.tokenizer.tokenize(text[ent["start"]:ent["end"]])
                for token in ent_tokens:
                    if token not in text_tokens:
                        warnings.warn(
                            "Token '{0}' does not appear in tokenized sentence {1}. ".format(token, text_tokens) +
                            "Entities must span whole tokens.")
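# Hedged usage sketch for the class above: "data/examples.json" is a hypothetical
# path, and passing an unrecognised backend (None) deliberately falls back to the
# whitespace tokenizer, as init_tokenizer() warns. The file format is auto-detected
# by guess_format() when file_format is left as None.
td = TrainingData("data/examples.json", backend=None)
print(len(td.intent_examples), td.num_entity_examples)
print(td.as_json(indent=2))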
class TrainingData(object):
    def __init__(self, resource_name, backend, language_name):
        self.intent_examples = []
        self.entity_examples = []
        self.resource_name = resource_name
        self.files = util.recursively_find_files(resource_name)
        self.fformat = self.guess_format(self.files)
        self.tokenizer = None
        self.language_name = language_name

        if backend in ['mitie', 'mitie_sklearn']:
            from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
            self.tokenizer = MITIETokenizer()
        elif backend in ['spacy_sklearn']:
            from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
            self.tokenizer = SpacyTokenizer(language_name)
        else:
            from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
            self.tokenizer = WhitespaceTokenizer()
            warnings.warn(
                "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")

        if self.fformat == 'luis':
            self.load_luis_data(self.files[0])
        elif self.fformat == 'wit':
            self.load_wit_data(self.files[0])
        elif self.fformat == 'api':
            self.load_api_data(self.files)
        elif self.fformat == 'rasa_nlu':
            self.load_data(self.files[0])
        else:
            raise ValueError("unknown training file format : {0}".format(self.fformat))

    def as_json(self, **kwargs):
        return json.dumps({
            "rasa_nlu_data": {
                "intent_examples": self.intent_examples,
                "entity_examples": self.entity_examples
            }
        }, **kwargs)

    def guess_format(self, files):
        for filename in files:
            filedata = json.loads(codecs.open(filename, encoding='utf-8').read())
            # dict.has_key() is Python 2 only; the `in` operator works on both versions
            if "data" in filedata and type(filedata.get("data")) is list:
                return 'wit'
            elif "luis_schema_version" in filedata:
                return 'luis'
            elif "userSays" in filedata:
                return 'api'
            elif "rasa_nlu_data" in filedata:
                return 'rasa_nlu'
        return 'unk'

    def load_wit_data(self, filename):
        data = json.loads(codecs.open(filename, encoding='utf-8').read())
        for s in data["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0] if intents else 'None'
            entities = [e for e in entities if ("start" in e and "end" in e)]
            for e in entities:
                e["value"] = e["value"][1:-1]
            self.intent_examples.append({"text": text, "intent": intent})
            self.entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })

    def load_luis_data(self, filename):
        warnings.warn(
            """LUIS data may not always be correctly imported because entity locations are specified by tokens.
            If you use a tokenizer which behaves differently from LUIS's your entities might not be correct""")
        data = json.loads(codecs.open(filename, encoding='utf-8').read())
        for s in data["utterances"]:
            text = s.get("text")
            tokens = [t for t in self.tokenizer.tokenize(text)]
            intent = s.get("intent")
            entities = []
            for e in s.get("entities") or []:
                i, ii = e["startPos"], e["endPos"] + 1
                # tokens must be escaped, otherwise regex metacharacters in the text break the search
                _regex = u"\\s*".join([re.escape(t) for t in tokens[i:ii]])
                expr = re.compile(_regex)
                m = expr.search(text)
                start, end = m.start(), m.end()
                val = text[start:end]
                entities.append({
                    "entity": e["entity"],
                    "value": val,
                    "start": start,
                    "end": end
                })
            self.intent_examples.append({"text": text, "intent": intent})
            self.entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })

    def load_api_data(self, files):
        for filename in files:
            data = json.loads(codecs.open(filename, encoding='utf-8').read())
            # get only intents, skip the rest. The property name is the target class
            if "userSays" not in data:
                continue
            intent = data.get("name")
            for s in data["userSays"]:
                text = "".join(map(lambda chunk: chunk["text"], s.get("data")))
                # add entities to each token, if available
                entities = []
                for e in filter(lambda chunk: "alias" in chunk or "meta" in chunk, s.get("data")):
                    start = text.find(e["text"])
                    end = start + len(e["text"])
                    val = text[start:end]
                    entities.append({
                        "entity": e["alias"] if "alias" in e else e["meta"],
                        "value": val,
                        "start": start,
                        "end": end
                    })
                self.intent_examples.append({"text": text, "intent": intent})
                self.entity_examples.append({
                    "text": text,
                    "intent": intent,
                    "entities": entities
                })

    def load_data(self, filename):
        data = json.loads(open(filename, 'rb').read())
        common = data['rasa_nlu_data'].get("common_examples", list())
        intent = data['rasa_nlu_data'].get("intent_examples", list())
        entity = data['rasa_nlu_data'].get("entity_examples", list())
        self.intent_examples = intent + common
        self.entity_examples = entity + common