def parse(self): print_DBG("Parsing file: " + self.in_file.name) line = None while line != "": line = self.read_line() stripped_line = line.lstrip() line_type = pu.get_top_level_line_type(line, stripped_line) if line_type is None: raise SyntaxError("Invalid top-level line", (self.in_file.name, 0, 0, stripped_line)) elif line_type == pu.LineType.empty or line_type == pu.LineType.comment: continue stripped_line = pu.strip_comments(stripped_line) # Stripping was not done before to compute the indentation if line_type == pu.LineType.include_file: self.parse_file(stripped_line[1:].rstrip()) elif line_type == pu.LineType.alias_declaration: self.parse_alias_definition(stripped_line) elif line_type == pu.LineType.slot_declaration: self.parse_slot_definition(stripped_line) else: # intent declaration self.parse_intent_definition(stripped_line) print_DBG("Parsing of file: " + self.in_file.name + " finished") self.parsing_finished = True
def run_generation(self, adapter_str=None):
    """
    Runs the generation of all intents and writes them out to the output
    file(s), using the adapter described by `adapter_str` if one is provided.
    @pre: the parsing has been done.
    """
    if adapter_str is None:
        adapter = self.adapter
    else:
        adapter = adapter_factory.create_adapter(adapter_str)

    self.generator = Generator()
    synonyms = AST.get_or_create().get_entities_synonyms()

    if os.path.exists(self.output_dir_path):
        if self.force_overwriting or self._ask_confirmation():
            shutil.rmtree(self.output_dir_path)
        else:
            print_DBG("Aborting generation. Exiting without any change.")
            return

    train_examples = list(self.generator.generate_train())
    if train_examples:
        adapter.write(
            os.path.join(self.output_dir_path, "train"),
            train_examples, synonyms
        )
    test_examples = list(self.generator.generate_test(train_examples))
    if test_examples:
        adapter.write(
            os.path.join(self.output_dir_path, "test"),
            test_examples, synonyms
        )
    print_DBG("Generation over")
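# Hedged usage sketch for `run_generation` above (not from the original code):
# the enclosing object, here called `facade` as an assumed name, must have
# parsed the templates first, per the @pre in the docstring. Adapter names
# mirror the factory lookup above.
#
#     facade.parse()
#     facade.run_generation()          # default adapter
#     facade.run_generation("jsonl")   # adapter chosen by name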
def generate_train(self):
    print_DBG("Generating training examples...")
    for intent_name in self.parser.intent_definitions:
        intent = self.parser.intent_definitions[intent_name]
        examples = intent.generate(self.max_nb_single_intent_examples)
        for example in examples:
            yield example
def run_generation(self, adapter_str=None):
    """
    Runs the generation of all intents and writes them out to the output
    file(s), using the adapter described by `adapter_str` if one is provided.
    @pre: the parsing has been done.
    """
    if adapter_str is None:
        adapter = self.adapter
    else:
        adapter = adapter_factory.create_adapter(adapter_str)

    self.generator = Generator(self.parser)
    synonyms = self.generator.get_entities_synonyms()

    if os.path.exists(self.output_dir_path):
        shutil.rmtree(self.output_dir_path)

    train_examples = list(self.generator.generate_train())
    if train_examples:
        adapter.write(
            os.path.join(self.output_dir_path, "train"),
            train_examples, synonyms
        )
    test_examples = list(self.generator.generate_test(train_examples))
    if test_examples:
        adapter.write(
            os.path.join(self.output_dir_path, "test"),
            test_examples, synonyms
        )
    print_DBG("Generation over")
def generate_train(self):
    print_DBG("Generating training examples...")
    intent_definitions = self.ast[UnitType.intent]
    for intent_name in intent_definitions:
        intent = intent_definitions[intent_name]
        examples = intent.generate_train()
        for example in examples:
            yield example
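# Hedged consumption sketch (not from the original code): `generate_train`
# yields examples lazily, so a caller can stream them or materialise the whole
# set at once. `generator` stands for a `Generator` instance as built in
# `run_generation` above; the cap of 100 is illustrative.
#
#     from itertools import islice
#     preview = list(islice(generator.generate_train(), 100))
#     full_set = list(generator.generate_train())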
def _parse_file_inclusion(self, lexical_tokens):
    """
    Opens the file that is included by the tokenized line `lexical_tokens`.
    @pre: `lexical_tokens` contains a tokenized file inclusion line.
    """
    self.open_new_file(lexical_tokens[1].text)
    print_DBG(
        "Parsing file: " + \
        self.input_file_manager.get_current_file_name()
    )
def parse_file(self, file_path):
    """
    Parses the template file(s) at `file_path`
    and translates them into an AST.
    """
    self.open_new_file(file_path)
    print_DBG(
        "Parsing file: " + \
        self.input_file_manager.get_current_file_name()
    )
    while True:
        line = self.input_file_manager.read_line()
        if line is None:  # End of file
            break

        currently_parsing_slot = (
            self._current_unit_declaration is not None
            and self._current_unit_declaration.unit_type == UnitType.slot
        )
        lexical_tokens = self.lexer.lex(line, currently_parsing_slot)
        lexical_tokens = remove_comment_tokens(lexical_tokens)

        if len(lexical_tokens) == 0:
            continue

        if lexical_tokens[0].type == TerminalType.file_inclusion_marker:
            self._parse_file_inclusion(lexical_tokens)
            self._declaration_line_allowed = True
            self._last_indentation = None
            self._current_unit_declaration = None
            self._current_variation_name = None
        elif lexical_tokens[0].type == TerminalType.indentation:
            self._parse_rule_line(lexical_tokens)
            self._declaration_line_allowed = True
            self._last_indentation = lexical_tokens[0].text
        elif (
            lexical_tokens[0].type in \
            (TerminalType.alias_decl_start,
             TerminalType.slot_decl_start,
             TerminalType.intent_decl_start)
        ):
            self._parse_unit_declaration_line(lexical_tokens)
            self._declaration_line_allowed = False
            self._last_indentation = None
        else:
            self.input_file_manager.syntax_error(
                "Couldn't parse this line: a line can be either " + \
                "an empty line, a comment line, a file inclusion line, " + \
                "a unit declaration or a rule."
            )
def generate_test(self, training_examples=None):
    should_generate_test_set = False
    for intent_name in self.parser.intent_definitions:
        if (
            self.parser.intent_definitions[intent_name]
                .nb_testing_examples_asked is not None
        ):
            should_generate_test_set = True
            break
    if should_generate_test_set:
        print_DBG("Generating testing examples...")
        for intent_name in self.parser.intent_definitions:
            intent = self.parser.intent_definitions[intent_name]
            examples = intent.generate(
                self.max_nb_single_intent_examples, training_examples
            )
            for example in examples:
                yield example
def generate_test(self, training_examples=None):
    should_generate_test_set = False
    intent_definitions = self.ast[UnitType.intent]
    for intent_name in intent_definitions:
        if (
            intent_definitions[intent_name].get_nb_testing_examples_asked() \
            is not None
        ):
            should_generate_test_set = True
            break
    if should_generate_test_set:
        print_DBG("Generating testing examples...")
        for intent_name in intent_definitions:
            intent = intent_definitions[intent_name]
            examples = intent.generate_test(training_examples)
            for example in examples:
                yield example
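# Hedged sketch (not from the original code): `generate_test` receives the
# already generated training examples, presumably so each intent can avoid
# re-emitting them in the test set. A typical call site mirrors
# `run_generation` above:
#
#     train_examples = list(generator.generate_train())
#     test_examples = list(generator.generate_test(train_examples))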
def parse(self): """ Parses the master file and subsequent files and transforms the information parsed into a dictionary of declaration names -> rules. """ print_DBG("Parsing master file: " + self.tokenizer.get_file_information()[0]) for token_line in self.tokenizer.next_tokenized_line(): if not token_line[0].isspace(): if token_line[0] == pu.INCLUDE_FILE_SYM: self.tokenizer.open_file(token_line[1]) print_DBG("Parsing file: " + self.tokenizer.get_file_information()[0]) self.stats["#files"] += 1 else: self._parse_declaration_initiator(token_line) self._expecting_rule = True self.stats["#declarations"] += 1 self._expected_indentation = None else: self._parse_rule(token_line) self._expecting_rule = False # Not expecting but still allowed self.stats["#rules"] += 1 self.tokenizer.close_files() print_DBG("Parsing finished!")
def main():
    # pylint: disable=bad-continuation
    argument_parser = argparse.ArgumentParser(
        description="Chatette v" + __version__ + " -- " +
                    "Generates NLU datasets from template files",
        epilog="SimGus -- 2018 -- Released under MIT license",
        prog="Chatette",
        add_help=True)
    argument_parser.add_argument("input", type=str,
                                 help="Path to master template file")
    argument_parser.add_argument("-o", "--out", dest="output", required=False,
                                 type=str, default=None,
                                 help="Output directory path")
    argument_parser.add_argument("-s", "--seed", dest="seed", required=False,
                                 type=str, default=None,
                                 help="Seed for the random generator " +
                                      "(any string without spaces will work)")
    argument_parser.add_argument("-l", "--local", dest="local", required=False,
                                 action="store_true", default=False,
                                 help="Change the base directory for output " +
                                      "files from the current working directory " +
                                      "to the directory containing the template " +
                                      "file")
    argument_parser.add_argument("-a", "--adapter", dest="adapter",
                                 required=False, type=str, default="rasa",
                                 help="Write adapter. Possible values: " +
                                      "['rasa', 'jsonl']")
    argument_parser.add_argument("-v", "--version", action="version",
                                 version="%(prog)s v" + __version__,
                                 help="Print the version number of the module")

    if len(sys.argv[1:]) == 0:
        argument_parser.print_help()
        argument_parser.exit()

    args = argument_parser.parse_args()

    template_file_path = args.input
    if args.local:
        dir_path = os.path.dirname(template_file_path)
    else:
        dir_path = os.getcwd()
    if args.output is None:
        dir_path = os.path.join(dir_path, "output")
    else:
        dir_path = os.path.join(dir_path, args.output)

    # Initialize the random number generator
    if args.seed is not None:
        random_seed(args.seed)

    with io.open(template_file_path, 'r') as in_file:
        parser = Parser(in_file)
        parser.parse()
        # parser.print_DBG()

    if args.adapter == 'rasa':
        adapter = RasaAdapter()  # pylint: disable=redefined-variable-type
    elif args.adapter == 'jsonl':
        adapter = JsonListAdapter()  # pylint: disable=redefined-variable-type
    else:
        raise ValueError("Unknown adapter was selected")

    generator = Generator(parser)
    synonyms = generator.get_entities_synonyms()

    train_examples = list(generator.generate_train())
    if train_examples:
        adapter.write(os.path.join(dir_path, "train"), train_examples, synonyms)
    test_examples = list(generator.generate_test(train_examples))
    if test_examples:
        adapter.write(os.path.join(dir_path, "test"), test_examples, synonyms)

    print_DBG("Generation over")
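# Hedged invocation sketch (not from the original code): `main` reads its
# options from `sys.argv`, so the CLI behaviour can be reproduced
# programmatically. The template path and option values below are
# illustrative only.
if __name__ == "__main__":
    import sys
    sys.argv = ["chatette", "templates/master.chatette",
                "-o", "output", "-a", "jsonl", "-s", "some-seed"]
    main()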