async def load_data(
    resource_name: Union[Text, "TrainingDataImporter"],
    domain: "Domain",
    remove_duplicates: bool = True,
    unique_last_num_states: Optional[int] = None,
    augmentation_factor: int = 50,
    tracker_limit: Optional[int] = None,
    use_story_concatenation: bool = True,
    debug_plots: bool = False,
    exclusion_percentage: Optional[int] = None,
) -> List["DialogueStateTracker"]:
    """Load training data from a resource.

    Args:
        resource_name: Resource to load the data from. Either a path or
            an importer.
        domain: Domain used for loading.
        remove_duplicates: Should duplicated training examples be removed?
        unique_last_num_states: Number of states in a conversation that make
            a tracker unique (this is used to identify duplicates).
        augmentation_factor: By how much should the story training data be
            augmented.
        tracker_limit: Maximum number of trackers to generate during
            augmentation.
        use_story_concatenation: Should stories be concatenated when doing
            data augmentation.
        debug_plots: Generate debug plots during loading.
        exclusion_percentage: How much data to exclude.

    Returns:
        List of loaded trackers.
    """
    # NOTE(review): `extract_story_graph` was used below without a visible
    # import; imported function-locally here for consistency with the other
    # `load_data` implementation in this file — confirm it is not already a
    # module-level name.
    from rasa.core.training import extract_story_graph
    from rasa.core.training.generator import TrainingDataGenerator
    from rasa.importers.importer import TrainingDataImporter

    if not resource_name:
        # No resource given -> nothing to load.
        return []

    if isinstance(resource_name, TrainingDataImporter):
        graph = await resource_name.get_stories(
            exclusion_percentage=exclusion_percentage
        )
    else:
        graph = await extract_story_graph(
            resource_name, domain, exclusion_percentage=exclusion_percentage
        )

    g = TrainingDataGenerator(
        graph,
        domain,
        remove_duplicates,
        unique_last_num_states,
        augmentation_factor,
        tracker_limit,
        use_story_concatenation,
        debug_plots,
    )
    return g.generate()
async def load_data(
    resource_name: Text,
    domain: "Domain",
    remove_duplicates: bool = True,
    unique_last_num_states: Optional[int] = None,
    augmentation_factor: int = 20,
    tracker_limit: Optional[int] = None,
    use_story_concatenation: bool = True,
    debug_plots: bool = False,
    exclusion_percentage: Optional[int] = None,
) -> List["DialogueStateTracker"]:
    """Load story training data from a path and generate trackers from it.

    Args:
        resource_name: Path to load the story data from.
        domain: Domain used for loading.
        remove_duplicates: Should duplicated training examples be removed?
        unique_last_num_states: Number of states in a conversation that make
            a tracker unique (used to identify duplicates).
        augmentation_factor: By how much should the story training data be
            augmented.
        tracker_limit: Maximum number of trackers to generate during
            augmentation.
        use_story_concatenation: Should stories be concatenated when doing
            data augmentation.
        debug_plots: Generate debug plots during loading.
        exclusion_percentage: How much data to exclude.

    Returns:
        List of loaded trackers; empty list when no resource is given.
    """
    # Imports are function-local, matching the file's existing style.
    from rasa.core.training import extract_story_graph
    from rasa.core.training.generator import TrainingDataGenerator

    if resource_name:
        graph = await extract_story_graph(
            resource_name, domain, exclusion_percentage=exclusion_percentage)
        g = TrainingDataGenerator(
            graph,
            domain,
            remove_duplicates,
            unique_last_num_states,
            augmentation_factor,
            tracker_limit,
            use_story_concatenation,
            debug_plots,
        )
        return g.generate()
    else:
        # No resource given -> nothing to load.
        return []
async def visualize_stories(
    story_steps: List[StoryStep],
    domain: Domain,
    output_file: Optional[Text],
    max_history: int,
    interpreter: NaturalLanguageInterpreter = RegexInterpreter(),
    nlu_training_data: Optional["TrainingData"] = None,
    should_merge_nodes: bool = True,
    fontsize: int = 12,
    silent: bool = False,
):
    """Generate a graph that visualizes the flows of a set of stories.

    Visualization trades graph size against fidelity: the graph built from
    the stories is compressed by merging similar nodes, which may introduce
    paths that are not literally in the stories (we try to minimize that).

    `output_file` defines if and where a file containing the plotted graph
    should be stored. `max_history` is the graph's 'memory': nodes are only
    merged when they are equal within that history, so a larger history
    makes merges less likely. `nlu_training_data` can be a Rasa NLU training
    data instance; it is used to replace the user messages from the story
    file with actual messages from the training data.
    """
    generator = TrainingDataGenerator(
        StoryGraph(story_steps),
        domain,
        use_story_concatenation=False,
        tracker_limit=100,
        augmentation_factor=0,
    )
    trackers = generator.generate(silent)

    # One event sequence per completed tracker feeds the visualization.
    sequences = [tracker.events for tracker in trackers]

    return await visualize_neighborhood(
        None,
        sequences,
        output_file,
        max_history,
        interpreter,
        nlu_training_data,
        should_merge_nodes,
        max_distance=1,
        fontsize=fontsize,
    )
async def train(self):
    """Train the NLU model and the dialogue policies for the configured skill.

    Training only runs when no model directory for the skill exists yet
    under ``data/``; otherwise this is a no-op (apart from the nltk
    download).
    """
    nltk.download('punkt')

    lang = self.config['language']
    # All artifacts for this skill are persisted under this directory.
    # Built once instead of re-concatenating the path six times below.
    skill_path = 'data/' + self.config['skill-id']

    if os.path.exists(skill_path):
        # A trained model already exists for this skill; nothing to do.
        return

    _LOGGER.info("Starting Skill training.")
    _LOGGER.info("Generating stories.")
    data, domain_data, stories = await GenerateStories.run(
        self.config['skill-id'], self.config['language'], self.asm)

    # --- NLU training ---
    training_data = TrainingData(training_examples=data)
    nlu_config = RasaNLUModelConfig({
        "language": lang,
        "pipeline": self.config['pipeline'],
        "data": None
    })
    trainer = Trainer(nlu_config, None, True)
    _LOGGER.info("Training Arcus NLU")
    trainer.train(training_data)
    trainer.persist(skill_path, None, 'nlu')

    # --- Core (dialogue policy) training ---
    domain = Domain.from_dict(domain_data)
    reader = StoryFileReader(domain, RegexInterpreter(), None, False)
    story_steps = await reader.process_lines(stories)
    graph = StoryGraph(story_steps)
    g = TrainingDataGenerator(
        graph,
        domain,
        remove_duplicates=True,
        unique_last_num_states=None,
        augmentation_factor=20,
        tracker_limit=None,
        use_story_concatenation=True,
        debug_plots=False,
    )
    training_trackers = g.generate()

    policy_list = SimplePolicyEnsemble.from_dict(
        {"policies": self.config['policies']})
    policy_ensemble = SimplePolicyEnsemble(policy_list)
    _LOGGER.info("Training Arcus Core")
    policy_ensemble.train(training_trackers, domain)
    policy_ensemble.persist(skill_path + "/core", False)
    domain.persist(skill_path + "/core/model")
    domain.persist_specification(skill_path + "/core")
async def _generate_trackers(resource_name, agent, max_stories=None, use_e2e=False):
    """Build dialogue trackers from the stories found at `resource_name`.

    Story concatenation and augmentation are switched off; `max_stories`
    caps how many trackers are produced.
    """
    from rasa.core import training
    from rasa.core.training.generator import TrainingDataGenerator

    graph = await training.extract_story_graph(
        resource_name, agent.domain, agent.interpreter, use_e2e
    )

    generator = TrainingDataGenerator(
        graph,
        agent.domain,
        use_story_concatenation=False,
        augmentation_factor=0,
        tracker_limit=max_stories,
    )
    return generator.generate()
def verify_story_structure(self, ignore_warnings: bool = True, max_history: Optional[int] = None) -> bool:
    """Verifies that the bot behaviour in stories is deterministic.

    Args:
        ignore_warnings: When `True`, return `True` even if conflicts were found.
        max_history: Maximal number of events to take into account for conflict identification.

    Returns:
        `False` if a conflict was found and `ignore_warnings` is `False`.
        `True` otherwise.
    """
    logger.info("Story structure validation...")

    # Generate trackers with augmentation disabled and duplicates kept,
    # so conflicts are detected against the stories exactly as written.
    trackers = TrainingDataGenerator(
        self.story_graph,
        domain=self.domain,
        remove_duplicates=False,
        augmentation_factor=0,
    ).generate()

    # Create a list of `StoryConflict` objects
    conflicts = rasa.core.training.story_conflict.find_story_conflicts(
        trackers, self.domain, max_history)

    # Each conflict is logged individually as a warning.
    if not conflicts:
        logger.info("No story structure conflicts found.")
    else:
        for conflict in conflicts:
            logger.warning(conflict)

    return ignore_warnings or not conflicts
async def _generate_trackers(
    resource_name: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    use_e2e: bool = False,
) -> List[Any]:
    """Read stories from `resource_name` and turn them into story trackers.

    Story concatenation and augmentation are switched off; `max_stories`
    limits how many trackers are produced.
    """
    from rasa.core import training
    from rasa.core.training.generator import TrainingDataGenerator

    graph = await training.extract_story_graph(resource_name, agent.domain, use_e2e)

    data_generator = TrainingDataGenerator(
        graph,
        agent.domain,
        use_story_concatenation=False,
        augmentation_factor=0,
        tracker_limit=max_stories,
    )
    return data_generator.generate_story_trackers()
async def _setup_trackers_for_testing(
    domain_path: Text, training_data_file: Text
) -> Tuple[List[TrackerWithCachedStates], Domain]:
    """Load a domain and training stories, then build trackers for testing.

    Duplicates are kept and augmentation is disabled so the trackers
    mirror the training data exactly. Returns the trackers together with
    the loaded domain.
    """
    file_importer = RasaFileImporter(
        domain_path=domain_path, training_data_paths=[training_data_file]
    )
    validator = await Validator.from_importer(file_importer)

    generator = TrainingDataGenerator(
        validator.story_graph,
        domain=validator.domain,
        remove_duplicates=False,
        augmentation_factor=0,
    )
    return generator.generate(), validator.domain