def train(self, intent_fst) -> None: """Save examples to JSON file.""" examples_path = self.profile.write_path( self.profile.get("intent.fuzzywuzzy.examples_json")) sentences_by_intent: Dict[str, Any] = make_sentences_by_intent(intent_fst) with open(examples_path, "w") as examples_file: json.dump(sentences_by_intent, examples_file, indent=4) self._logger.debug("Wrote intent examples to %s", examples_path)
def train(self, intent_graph) -> None: """Save examples to JSON file.""" examples_path = self.profile.write_path( self.profile.get("intent.fuzzywuzzy.examples_json")) sentences_by_intent = make_sentences_by_intent( intent_graph, extra_converters=self.converters) with open(examples_path, "w") as examples_file: json.dump(sentences_by_intent, examples_file, indent=4) self._logger.debug("Wrote intent examples to %s", examples_path)
def train(self, intent_fst) -> None: """Run external trainer.""" try: self._logger.debug(self.command) # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] } sentences_by_intent: Dict[str, Any] = make_sentences_by_intent(intent_fst) # JSON -> STDIN json_input = json.dumps({sentences_by_intent}).encode() subprocess.run(self.command, input=json_input, check=True) except Exception: self._logger.exception("train")
def train(self, intent_fst) -> None: """Run external trainer.""" try: self._logger.debug(self.command) # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] } sentences_by_intent = make_sentences_by_intent(intent_fst) json_sentences = { intent: [r.asdict() for r in sentences_by_intent[intent]] for intent in sentences_by_intent } # JSON -> STDIN json_input = json.dumps(json_sentences).encode() subprocess.run(self.command, input=json_input, check=True) except Exception: self._logger.exception("train")
def train(self, intent_fst) -> None: """Train intent classifier and named entity recognizers.""" from flair.data import Sentence, Token from flair.models import SequenceTagger, TextClassifier from flair.embeddings import ( FlairEmbeddings, StackedEmbeddings, DocumentRNNEmbeddings, ) from flair.data import TaggedCorpus from flair.trainers import ModelTrainer # Directory to look for downloaded embeddings cache_dir = self.profile.read_path( self.profile.get("intent.flair.cache_dir", "flair/cache")) os.makedirs(cache_dir, exist_ok=True) # Directory to store generated models data_dir = self.profile.write_path( self.profile.get("intent.flair.data_dir", "flair/data")) if os.path.exists(data_dir): shutil.rmtree(data_dir) self.embeddings = self.profile.get("intent.flair.embeddings", []) assert len(self.embeddings) > 0, "No word embeddings" # Create directories to write training data to class_data_dir = os.path.join(data_dir, "classification") ner_data_dir = os.path.join(data_dir, "ner") os.makedirs(class_data_dir, exist_ok=True) os.makedirs(ner_data_dir, exist_ok=True) # Convert FST to training data # ---------------------------- # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] } sentences_by_intent: Dict[str, Any] = {} # Get sentences for training do_sampling = self.profile.get("intent.flair.do_sampling", True) start_time = time.time() if do_sampling: # Sample from each intent FST num_samples = int( self.profile.get("intent.flair.num_samples", 10000)) intent_map_path = self.profile.read_path( self.profile.get("training.intent.intent_map", "intent_map.json")) with open(intent_map_path, "r") as intent_map_file: intent_map = json.load(intent_map_file) # Gather FSTs for all known intents fsts_dir = self.profile.write_dir( self.profile.get("speech_to_text.fsts_dir")) intent_fst_paths = { intent_id: os.path.join(fsts_dir, f"{intent_id}.fst") for intent_id in intent_map.keys() } # Generate samples self._logger.debug( "Generating %s sample(s) from %s intent(s)", num_samples, len(intent_fst_paths), ) sentences_by_intent = sample_sentences_by_intent( intent_fst_paths, num_samples) else: # Exhaustively generate all sentences self._logger.debug( "Generating all possible sentences (may take a long time)") sentences_by_intent = make_sentences_by_intent(intent_fst) sentence_time = time.time() - start_time self._logger.debug("Generated sentences in %s second(s)", sentence_time) # Get least common multiple in order to balance sentences by intent lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values())) # Generate examples class_sentences = [] ner_sentences: Dict[str, List[Sentence]] = defaultdict(list) for intent_name, intent_sents in sentences_by_intent.items(): num_repeats = max(1, lcm_sentences // len(intent_sents)) for intent_sent in intent_sents: # Only train an intent classifier if there's more than one intent if len(sentences_by_intent) > 1: # Add balanced copies for _ in range(num_repeats): class_sent = Sentence(labels=[intent_name]) for word in intent_sent["tokens"]: class_sent.add_token(Token(word)) class_sentences.append(class_sent) if len(intent_sent["entities"]) == 0: continue # no entities, no sequence tagger # Named entity recognition (NER) example token_idx = 0 entity_start = { ev["start"]: ev for ev in intent_sent["entities"] } entity_end = {ev["end"]: ev for ev in intent_sent["entities"]} entity = None word_tags = [] for word in intent_sent["tokens"]: # Determine tag label tag = "O" if not entity else f"I-{entity}" if token_idx in entity_start: entity = 
entity_start[token_idx]["entity"] tag = f"B-{entity}" word_tags.append((word, tag)) # word ner token_idx += len(word) + 1 if (token_idx - 1) in entity_end: entity = None # Add balanced copies for _ in range(num_repeats): ner_sent = Sentence() for word, tag in word_tags: token = Token(word) token.add_tag("ner", tag) ner_sent.add_token(token) ner_sentences[intent_name].append(ner_sent) # Start training max_epochs = int(self.profile.get("intent.flair.max_epochs", 100)) # Load word embeddings self._logger.debug("Loading word embeddings from %s", cache_dir) word_embeddings = [ FlairEmbeddings(os.path.join(cache_dir, "embeddings", e)) for e in self.embeddings ] if len(class_sentences) > 0: self._logger.debug("Training intent classifier") # Random 80/10/10 split class_train, class_dev, class_test = self._split_data( class_sentences) class_corpus = TaggedCorpus(class_train, class_dev, class_test) # Intent classification doc_embeddings = DocumentRNNEmbeddings( word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256, ) classifier = TextClassifier( doc_embeddings, label_dictionary=class_corpus.make_label_dictionary(), multi_label=False, ) self._logger.debug("Intent classifier has %s example(s)", len(class_sentences)) trainer = ModelTrainer(classifier, class_corpus) trainer.train(class_data_dir, max_epochs=max_epochs) else: self._logger.info("Skipping intent classifier training") if len(ner_sentences) > 0: self._logger.debug("Training %s NER sequence tagger(s)", len(ner_sentences)) # Named entity recognition stacked_embeddings = StackedEmbeddings(word_embeddings) for intent_name, intent_ner_sents in ner_sentences.items(): ner_train, ner_dev, ner_test = self._split_data( intent_ner_sents) ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test) tagger = SequenceTagger( hidden_size=256, embeddings=stacked_embeddings, tag_dictionary=ner_corpus.make_tag_dictionary( tag_type="ner"), tag_type="ner", use_crf=True, ) ner_intent_dir = os.path.join(ner_data_dir, intent_name) os.makedirs(ner_intent_dir, exist_ok=True) self._logger.debug( "NER tagger for %s has %s example(s)", intent_name, len(intent_ner_sents), ) trainer = ModelTrainer(tagger, ner_corpus) trainer.train(ner_intent_dir, max_epochs=max_epochs) else: self._logger.info("Skipping NER sequence tagger training")
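# ---------------------------------------------------------------------------
# A sketch of the lcm() helper used above to balance examples across intents.
# The real helper lives elsewhere in the project; the standard gcd-based
# reduction shown here is an assumption, not the module's actual code.
from functools import reduce
from math import gcd


def lcm(*numbers: int) -> int:
    """Least common multiple of one or more positive integers."""
    return reduce(lambda a, b: (a * b) // gcd(a, b), numbers, 1)


# Example: with intents of 3 and 4 sentences, lcm(3, 4) == 12, so the smaller
# intent's sentences are repeated 12 // 3 == 4 times and the larger intent's
# 12 // 4 == 3 times, matching num_repeats above.
# ---------------------------------------------------------------------------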
def train(self, intent_fst) -> None: """Create intents, entities, and keywords.""" # Load "stop" words (common words that are excluded from training) stop_words: Set[str] = set() stop_words_path = self.profile.read_path("stop_words.txt") if os.path.exists(stop_words_path): with open(stop_words_path, "r") as stop_words_file: stop_words = { line.strip() for line in stop_words_file if len(line.strip()) > 0 } # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] } sentences_by_intent: Dict[str, Any] = make_sentences_by_intent(intent_fst) # Generate intent configuration entities: Dict[str, Set[str]] = {} intents: Dict[str, Dict[str, Any]] = {} for intent_name, intent_sents in sentences_by_intent.items(): intent: Dict[str, Any] = { "name": intent_name, "require": [], "optionally": [], } # Track word usage by sentence to determine required vs. optional words word_counts: Dict[str, int] = Counter() entity_counts: Dict[str, int] = Counter() # Process sentences for this intent for intent_sent in intent_sents: _, slots, word_tokens = ( intent_sent.get("raw_text", intent_sent["text"]), intent_sent["entities"], intent_sent["tokens"], ) entity_tokens: Set[str] = set() # Group slot values by entity slot_entities: Dict[str, List[str]] = defaultdict(list) for sent_ent in slots: slot_entities[sent_ent["entity"]].append(sent_ent["value"]) # Add entities for entity_name, entity_values in slot_entities.items(): # Prefix entity name with intent name entity_name = "{0}.{1}".format(intent_name, entity_name) if entity_name not in entities: entities[entity_name] = set() entities[entity_name].update(entity_values) entity_counts[entity_name] += 1 # Split entity values by whitespace for value in entity_values: entity_tokens.update(re.split(r"\s", value)) # Get all non-stop words that are not part of entity values words = set(word_tokens) - entity_tokens - stop_words # Increment count for words for word in words: word_counts[word] += 1 # Decide on required vs. optional for words and entities num_sentences = len(intent_sents) required_words = set() optional_words = set() for word, count in word_counts.items(): assert count <= num_sentences, "Invalid word count" if count == num_sentences: # Word exists in all sentences required_words.add(word) else: # Word only exists in some sentences optional_words.add(word) if len(required_words) > 0: # Create entity for required keywords entity_name = "{0}RequiredKeyword".format(intent_name) entities[entity_name] = required_words intent["require"].append(entity_name) if len(optional_words) > 0: # Create entity for required keywords entity_name = "{0}OptionalKeyword".format(intent_name) entities[entity_name] = optional_words intent["optionally"].append(entity_name) # Add required/optional entities for name, count in entity_counts.items(): assert count <= num_sentences, "Invalid entity count" if count == num_sentences: # Entity exists in all sentences intent["require"].append(name) else: # Entity only exists in some sentences intent["optionally"].append(name) intents[intent_name] = intent # --------------------------------------------------------------------- # Write configuration file config = { "intents": intents, # Convert sets to lists because JSON serializer is super whiny "entities": {name: list(values) for name, values in entities.items()}, } config_path = self.profile.write_path("adapt_config.json") with open(config_path, "w") as config_file: json.dump(config, config_file, indent=4) self._logger.debug("Wrote adapt configuration to %s", config_path)
def train(self, intent_graph) -> None: """Convert examples to Markdown and POST to RasaNLU server.""" import requests # Load settings language = self.profile.get("language", "en") rasa_config = self.profile.get("intent.rasa", {}) url = rasa_config.get("url", "http://localhost:5005") project_name = rasa_config.get("project_name", "rhasspy") # Create markdown examples examples_md_path = self.profile.write_path( rasa_config.get("examples_markdown", "intent_examples.md")) # Build Markdown sentences sentences_by_intent = make_sentences_by_intent( intent_graph, extra_converters=self.converters) # Write to YAML/Markdown file with open(examples_md_path, "w") as examples_md_file: for intent_name, intent_sents in sentences_by_intent.items(): # Rasa Markdown training format print(f"## intent:{intent_name}", file=examples_md_file) for intent_sent in intent_sents: raw_index = 0 index_entity = { e["raw_start"]: e for e in intent_sent["entities"] } entity = None sentence_tokens = [] entity_tokens = [] for token in intent_sent["raw_tokens"]: if entity and (raw_index >= entity["raw_end"]): # Finish current entity last_token = entity_tokens[-1] entity_tokens[ -1] = f"{last_token}]({entity['entity']})" sentence_tokens.extend(entity_tokens) entity = None entity_tokens = [] new_entity = index_entity.get(raw_index) if new_entity: # Begin new entity assert entity is None, "Unclosed entity" entity = new_entity entity_tokens = [] token = f"[{token}" if entity: # Add to current entity entity_tokens.append(token) else: # Add directly to sentence sentence_tokens.append(token) raw_index += len(token) + 1 if entity: # Finish final entity last_token = entity_tokens[-1] entity_tokens[ -1] = f"{last_token}]({entity['entity']})" sentence_tokens.extend(entity_tokens) # Print single example print("-", " ".join(sentence_tokens), file=examples_md_file) # Newline between intents print("", file=examples_md_file) # Create training YAML file with tempfile.NamedTemporaryFile(suffix=".json", mode="w+", delete=False) as training_file: training_config = StringIO() training_config.write(f'language: "{language}"\n') training_config.write('pipeline: "pretrained_embeddings_spacy"\n') # Write markdown directly into YAML. # Because reasons. with open(examples_md_path, "r") as examples_md_file: blank_line = False for line in examples_md_file: line = line.strip() if line: if blank_line: print("", file=training_file) blank_line = False print(f" {line}", file=training_file) else: blank_line = True # Do training via HTTP API training_url = urljoin(url, "model/train") training_file.seek(0) with open(training_file.name, "rb") as training_data: training_body = { "config": training_config.getvalue(), "nlu": training_data.read().decode("utf-8"), } training_config.close() response = requests.post( training_url, data=json.dumps(training_body), params=json.dumps({"project": project_name}), headers={"Content-Type": "application/json"}, ) self._logger.debug("POSTed training data to %s", training_url) try: response.raise_for_status() model_dir = rasa_config.get("model_dir", "") model_file = os.path.join(model_dir, response.headers["filename"]) self._logger.debug("Received model %s", model_file) # Replace model model_url = urljoin(url, "model") requests.put(model_url, json={"model_file": model_file}) except Exception: # Rasa gives quite helpful error messages, so extract them from the response. raise Exception( f'{response.reason}: {json.loads(response.content)["message"]}' )