import logging

from google.cloud import translate  # google-cloud-translate v2-style client
from rasa.nlu.training_data.formats import MarkdownReader, MarkdownWriter

logger = logging.getLogger(__name__)


def task(source, dest, cred_file, percent):
    # Load Rasa NLU training data.
    r = MarkdownReader()
    with open(source, "r") as fin:
        nlu = fin.read()
    nlu_train = r.reads(nlu)

    translate_client = translate.Client.from_service_account_json(cred_file)

    def trans(text):
        trans_text = translate_client.translate(
            text, source_language="en", target_language="zh-TW")
        # Log the text actually being translated, not the enclosing loop
        # variable, which would be wrong when translating entity values.
        logger.info(u'origin: {}, translated: {}'.format(
            text, trans_text['translatedText']))
        return trans_text['translatedText']

    nlu_train.training_examples = random_select_samples(
        nlu_train.training_examples, percent)
    for example in nlu_train.training_examples:
        example.text = trans(example.text)
        if example.get("entities"):
            for entity in example.get("entities"):
                entity["value"] = trans(entity['value'])

    # Generate Rasa NLU translated training data.
    w = MarkdownWriter()
    w.dump(dest, nlu_train)
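The function above also depends on a random_select_samples helper that is not shown. A minimal sketch of what it presumably does, assuming percent is an integer between 0 and 100 (both the sampling semantics and the rounding are assumptions):

import random

def random_select_samples(training_examples, percent):
    # Assumed behavior: keep roughly `percent` percent of the examples,
    # chosen uniformly at random, always keeping at least one.
    k = max(1, int(len(training_examples) * percent / 100))
    return random.sample(training_examples, k)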
def test_dump_nlu_with_responses():
    md = """## intent:greet
- hey
- howdy
- hey there
- hello
- hi
- good morning
- good evening
- dear sir

## intent:chitchat/ask_name
- What's your name?
- What can I call you?

## intent:chitchat/ask_weather
- How's the weather?
- Is it too hot outside?
"""
    r = MarkdownReader()
    nlu_data = r.reads(md)

    dumped = nlu_data.nlu_as_markdown()
    assert dumped == md
def _write_nlu_yaml(
    training_data_path: Path, output_path: Path, source_path: Path
) -> None:
    reader = MarkdownReader()
    writer = RasaYAMLWriter()

    training_data = reader.read(training_data_path)
    writer.dump(output_path, training_data)

    print_success(f"Converted NLU file: '{source_path}' >> '{output_path}'.")
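A possible invocation of _write_nlu_yaml, sketched with placeholder paths; it assumes the Markdown file exists and the output directory has already been created:

from pathlib import Path

# Placeholder paths for illustration only.
source = Path("data/nlu.md")
_write_nlu_yaml(source, Path("converted/nlu.yml"), source)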
def test_markdown_entity_regex():
    r = MarkdownReader()

    md = """
## intent:restaurant_search
- i'm looking for a place to eat
- i'm looking for a place in the [north](loc-direction) of town
- show me [chines](cuisine:chinese) restaurants
- show me [chines](22_ab-34*3.A:43er*+?df) restaurants
"""
    result = r.reads(md)

    assert len(result.training_examples) == 4

    first = result.training_examples[0]
    assert first.data == {"intent": "restaurant_search"}
    assert first.text == "i'm looking for a place to eat"

    second = result.training_examples[1]
    assert second.data == {
        "intent": "restaurant_search",
        "entities": [
            {"start": 31, "end": 36, "value": "north", "entity": "loc-direction"}
        ],
    }
    assert second.text == "i'm looking for a place in the north of town"

    third = result.training_examples[2]
    assert third.data == {
        "intent": "restaurant_search",
        "entities": [
            {"start": 8, "end": 14, "value": "chinese", "entity": "cuisine"}
        ],
    }
    assert third.text == "show me chines restaurants"

    fourth = result.training_examples[3]
    assert fourth.data == {
        "intent": "restaurant_search",
        "entities": [
            {"start": 8, "end": 14, "value": "43er*+?df", "entity": "22_ab-34*3.A"}
        ],
    }
    assert fourth.text == "show me chines restaurants"
def read_inputs_md(input_path):
    reader = MarkdownReader()
    reader.read(input_path, language='de', fformat='MARKDOWN')
    texts = []
    cats = []
    for message in reader.training_examples:
        texts.append(message.text)
        cats.append(message.get('intent'))
    return texts, cats
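For illustration, a quick way to inspect what read_inputs_md returns (the path is a placeholder):

from collections import Counter

texts, cats = read_inputs_md("data/nlu.md")
print(f"{len(texts)} examples across {len(set(cats))} intents")
print(Counter(cats).most_common(3))  # most frequent intents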
def test_check_check_correct_entity_annotations(text: Text, warnings: int):
    # The @pytest.mark.parametrize decorator supplying `text` and `warnings`
    # is not shown here.
    reader = MarkdownReader()
    tokenizer = WhitespaceTokenizer()

    training_data = reader.reads(text)
    tokenizer.train(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractor.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    # Note: the original wrapped each membership test in a single-element
    # list, which is always truthy; the generator below tests the actual
    # condition.
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
def test_markdown_entity_regex(example: Text, expected_num_entities: int):
    r = MarkdownReader()

    md = f"""
## intent:test-intent
- {example}
"""
    result = r.reads(md)

    assert len(result.training_examples) == 1

    actual_example = result.training_examples[0]
    assert actual_example.data["intent"] == "test-intent"
    assert len(actual_example.data.get("entities", [])) == expected_num_entities
def test_markdown_order():
    r = MarkdownReader()

    md = """## intent:z
- i'm looking for a place to eat
- i'm looking for a place in the [north](loc-direction) of town

## intent:a
- intent a
- also very important
"""
    training_data = r.reads(md)
    assert training_data.nlu_as_markdown() == md
def test_deprecation_warning_logged():
    r = MarkdownReader()

    md = """
## intent:test-intent
- I want to go to [LA](city:Los Angeles)
"""

    with pytest.warns(
        FutureWarning,
        match=r"You are using the deprecated training data format to declare "
        r"synonyms.*",
    ):
        r.reads(md)
def test_markdown_unespace_tokens():
    r = MarkdownReader()

    md = """## intent:test-intent
- Hi \\t Can you help me?\\n I want to go to [Alexandria]{"entity": "city"}
"""
    expected_num_entities = 1

    training_data = r.reads(md)
    assert len(training_data.training_examples) == 1

    actual_example = training_data.training_examples[0]
    assert actual_example.data["intent"] == "test-intent"
    assert len(actual_example.data.get("entities", [])) == expected_num_entities
def _convert_to_yaml(args: argparse.Namespace, is_nlu: bool) -> None:
    output = Path(args.out)
    if not os.path.exists(output):
        print_error_and_exit(
            f"The output path '{output}' doesn't exist. Please make sure to specify "
            f"an existing directory and try again."
        )

    training_data = Path(args.data)
    if not os.path.exists(training_data):
        print_error_and_exit(
            f"The training data path {training_data} doesn't exist "
            f"and will be skipped."
        )

    num_of_files_converted = 0
    for file in os.listdir(training_data):
        source_path = training_data / file
        output_path = output / f"{source_path.stem}{CONVERTED_FILE_SUFFIX}"

        if MarkdownReader.is_markdown_nlu_file(source_path):
            if not is_nlu:
                continue
            _write_nlu_yaml(source_path, output_path, source_path)
            num_of_files_converted += 1
        elif not is_nlu and MarkdownStoryReader.is_markdown_story_file(source_path):
            _write_core_yaml(source_path, output_path, source_path)
            num_of_files_converted += 1
        else:
            print_warning(f"Skipped file: '{source_path}'.")

    print_info(f"Converted {num_of_files_converted} file(s), saved in '{output}'.")
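A hedged sketch of driving _convert_to_yaml outside the CLI by hand-building the namespace; the paths are placeholders and, per the checks above, must already exist:

import argparse

args = argparse.Namespace(data="data/", out="converted/")
_convert_to_yaml(args, is_nlu=True)  # convert only NLU Markdown files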
def _reader_factory(fformat: Text) -> Optional["TrainingDataReader"]:
    """Generates the appropriate reader class based on the file format."""
    from rasa.nlu.training_data.formats import (
        MarkdownReader,
        WitReader,
        LuisReader,
        RasaReader,
        DialogflowReader,
        NLGMarkdownReader,
    )

    reader = None
    if fformat == LUIS:
        reader = LuisReader()
    elif fformat == WIT:
        reader = WitReader()
    elif fformat in DIALOGFLOW_RELEVANT:
        reader = DialogflowReader()
    elif fformat == RASA:
        reader = RasaReader()
    elif fformat == MARKDOWN:
        reader = MarkdownReader()
    elif fformat == MARKDOWN_NLG:
        reader = NLGMarkdownReader()
    return reader
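Usage sketch for _reader_factory, assuming the MARKDOWN constant from the surrounding module is in scope:

reader = _reader_factory(MARKDOWN)
if reader is not None:
    # `reads` parses training data from a string rather than a file.
    training_data = reader.reads("## intent:greet\n- hello\n- hi there")
    print(len(training_data.training_examples))  # expected: 2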
def test_markdown_entity_regex(
    example: Text,
    expected_entities: Optional[List[Dict[Text, Any]]],
    expected_text: Text,
):
    r = MarkdownReader()

    md = f"""
## intent:test-intent
- {example}
"""
    result = r.reads(md)

    assert len(result.training_examples) == 1

    actual_example = result.training_examples[0]
    assert actual_example.data["intent"] == "test-intent"
    assert actual_example.data.get("entities") == expected_entities
    assert actual_example.text == expected_text
def md_format_message(text, intent, entities):
    from rasa.nlu.training_data.formats import MarkdownWriter, MarkdownReader

    message_from_md = MarkdownReader()._parse_training_example(text)
    deserialised_entities = deserialise_entities(entities)
    return MarkdownWriter()._generate_message_md(
        {
            "text": message_from_md.text,
            "intent": intent,
            "entities": deserialised_entities,
        }
    )
def md_format_message(text, intent, entities) -> Text:
    from rasa.nlu.training_data.formats import MarkdownReader
    from rasa.nlu.training_data.formats.readerwriter import TrainingDataWriter

    message_from_md = MarkdownReader().parse_training_example(text)
    deserialised_entities = deserialise_entities(entities)
    return TrainingDataWriter.generate_message(
        {
            "text": message_from_md.text,
            "intent": intent,
            "entities": deserialised_entities,
        }
    )
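An illustrative call to this newer md_format_message; with no entities the result should reduce to the plain message text, though the exact formatting is up to TrainingDataWriter.generate_message:

line = md_format_message("hello there", intent="greet", entities=[])
print(line)  # expected: "hello there"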
async def replace_placeholders(
    self,
    example: Message,
    faker_: Faker,
    matches: List[Tuple[Any, ...]],
    count: int,
) -> AsyncIterator[Message]:
    original_text = await self.rebuild_original_text(example)
    for _ in range(count):
        text = await self.replace_placeholders_in_text(
            example.text, faker_, matches)
        original_text = await self.replace_placeholders_in_text(
            original_text, faker_, matches)
        entities = MarkdownReader._find_entities_in_training_example(
            original_text)
        new_message = Message.build(text, example.get("intent"), entities)
        yield new_message
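Consuming the async generator could look like the sketch below; the augmenter instance, the empty matches list, and the count are illustrative assumptions:

import asyncio
from faker import Faker

async def demo(augmenter, example):
    # Yields `count` augmented copies of `example`.
    async for message in augmenter.replace_placeholders(
            example, Faker(), matches=[], count=2):
        print(message.text)

# asyncio.run(demo(augmenter, example))  # given concrete instances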
async def _correct_entities(
    latest_message: Dict[Text, Any], endpoint: EndpointConfig, sender_id: Text
) -> List[Dict[Text, Any]]:
    """Validate the entities of a user message.

    Returns the corrected entities."""
    from rasa.nlu.training_data.formats import MarkdownReader

    parse_original = latest_message.get("parse_data", {})
    entity_str = _as_md_message(parse_original)
    question = questionary.text(
        "Please mark the entities using [value](type) notation",
        default=entity_str)

    annotation = await _ask_questions(question, sender_id, endpoint)
    # noinspection PyProtectedMember
    parse_annotated = MarkdownReader()._parse_training_example(annotation)

    corrected_entities = _merge_annotated_and_original_entities(
        parse_annotated, parse_original)

    return corrected_entities
from rasa.nlu.training_data.formats import MarkdownReader
import xlsxwriter

workbook = xlsxwriter.Workbook('filename.xlsx')
worksheet = workbook.add_worksheet()

# Header row.
worksheet.write('A1', 'question')
worksheet.write('B1', 'label')
worksheet.write('C1', 'answer')

row = 1
col = 0

doc = "PATH\\TO\\nlu.md"
reader = MarkdownReader()
reader.read(doc, language='de', fformat='MARKDOWN')

# Collect (text, intent) pairs from the parsed training examples.
training_data = []
for message in reader.training_examples:
    training_data.append((message.text, message.get('intent')))

for question, label in training_data:
    worksheet.write_string(row, col, question)
    worksheet.write_string(row, col + 1, label)
    worksheet.write_string(row, col + 2, '')
    row += 1

workbook.close()