def test_invalid_pipeline_template():
    """A config naming a nonexistent pipeline template must be rejected."""
    bad_args = {"pipeline": "my_made_up_name"}
    config_file = write_file_config(bad_args)
    with pytest.raises(config.InvalidConfigError) as excinfo:
        config.load(config_file.name)
    # The error message should point at the unknown template.
    assert "unknown pipeline template" in str(excinfo.value)
def test_invalid_config_json():
    """Loading a file containing broken YAML must raise InvalidConfigError."""
    broken_yaml = """pipeline: [pretrained_embeddings_spacy"""  # unterminated list -> invalid yaml
    with tempfile.NamedTemporaryFile("w+", suffix="_tmp_config_file.json") as tmp:
        tmp.write(broken_yaml)
        tmp.flush()
        with pytest.raises(config.InvalidConfigError):
            config.load(tmp.name)
def test_invalid_config_json(tmp_path):
    """Loading a file containing broken YAML must raise InvalidConfigError."""
    broken_yaml = """pipeline: [pretrained_embeddings_spacy"""  # unterminated list -> invalid yaml
    config_path = tmp_path / "tmp_config_file.json"
    config_path.write_text(broken_yaml)
    with pytest.raises(config.InvalidConfigError):
        config.load(str(config_path))
def train_eval_rasa_nlu_model(lang='en', cross=False, save=''):
    """Train a Rasa NLU model on SemEval-2020 task-1 data, optionally with evaluation.

    :param lang: abbreviated language name (e.g. ``'en'``)
    :param cross: when True, also run ``run_evaluation`` on the held-out split
    :param save: version tag appended to the output directory names
    :rtype: None
    """
    # Local imports keep the heavy rasa dependency out of module import time.
    from rasa.nlu.training_data import load_data
    from rasa.nlu.model import Trainer
    from rasa.nlu.components import ComponentBuilder
    from rasa.nlu import config
    from rasa.nlu.test import run_evaluation

    # NOTE(review): "converrt" looks like a typo, but it is a runtime path --
    # confirm the file on disk really uses this spelling before renaming.
    config_file = source_config / "config_rasa_converrt.yml"
    if cross:
        filename_results = source_result / "rasa_cross_semeval_2020_model_task1_{}".format(save)
        train_data_obj = BuildSnipsDataTask1(lang, cross=cross, vers=save)
        # In cross mode build_rasa_data_task1() appears to return a
        # (train, test) pair -- TODO confirm against BuildSnipsDataTask1.
        train_data = train_data_obj.build_rasa_data_task1()
        training_data = load_data(str(train_data[0]))
        builder = ComponentBuilder(use_cache=True)
        trainer = Trainer(config.load(str(config_file)), builder)
        print("--> Training patent data with Rasa...")
        trainer.train(training_data, num_threads=8, n_jobs=-1, verbose=True)
        print("--> Saving model trained with Rasa (Rasa)...")
        model_directory = trainer.persist(filename_results)
        print("--> Evaluating training data with Rasa metrics (Cross-validation)...")
        import os
        from datetime import datetime
        filename_test = str(train_data[1])
        print(filename_test)
        # Timestamped output directory so repeated runs do not collide.
        dmtime = "test_{}_{}".format(save, datetime.now().strftime("%Y%m%d-%H%M%S"))
        out_test = source_result / "rasa_cross_evaluation_task1" / dmtime
        # Pick the most recently written nlu_* model directory for evaluation.
        model_directory = sorted(filename_results.glob("nlu_*"), key=os.path.getmtime)[-1]
        run_evaluation(filename_test, str(model_directory), output_directory=str(out_test))
    else:
        filename_results = source_result / "rasa_semeval_2020_model_task1_{}".format(save)
        train_data_obj = BuildSnipsDataTask1(lang, cross=cross, vers=save)
        # Here the builder result is passed to load_data() directly (no
        # indexing), unlike the cross branch -- presumably a single path.
        train_file = train_data_obj.build_rasa_data_task1()
        training_data = load_data(train_file)
        builder = ComponentBuilder(use_cache=True)
        trainer = Trainer(config.load(str(config_file)), builder)
        print("--> Training patent data with Rasa...")
        trainer.train(training_data, num_threads=8, verbose=True,
                      n_jobs=-1, fixed_model_name="nlu")
        print("--> Saving model trained with Rasa (Rasa)...")
        model_directory = trainer.persist(filename_results)
def test_set_attr_on_component(default_config):
    """set_component_attr must modify only the targeted pipeline slot."""
    cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml")
    cfg.set_component_attr(6, C=324)
    untouched = cfg.for_component(1)
    updated = cfg.for_component(6)
    # A component that was not targeted keeps its original configuration.
    assert untouched == {"name": "SpacyTokenizer"}
    # The targeted component gains the new attribute.
    assert updated == {"name": "SklearnIntentClassifier", "C": 324}
def test_run_cv_evaluation():
    """Cross-validation must report one score per fold for every metric."""
    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    nlu_config = config.load(
        "sample_configs/config_pretrained_embeddings_spacy.yml")
    n_folds = 2
    intent_results, entity_results = cross_validate(td, n_folds, nlu_config)
    # Same assertions as spelling them out one by one, just folded into a loop.
    for metric in ("Accuracy", "Precision", "F1-score"):
        assert len(intent_results.train[metric]) == n_folds
        assert len(intent_results.test[metric]) == n_folds
        assert len(entity_results.train["CRFEntityExtractor"][metric]) == n_folds
        assert len(entity_results.test["CRFEntityExtractor"][metric]) == n_folds
def train(nlu_config: Union[Text, RasaNLUModelConfig],
          data: Text,
          path: Optional[Text] = None,
          project: Optional[Text] = None,
          fixed_model_name: Optional[Text] = None,
          storage: Optional[Text] = None,
          component_builder: Optional[ComponentBuilder] = None,
          training_data_endpoint: Optional[EndpointConfig] = None,
          **kwargs: Any) -> Tuple[Trainer, Interpreter, Text]:
    """Loads the trainer and the data and runs the training of the model."""
    # Accept either a config path or an already-loaded config object.
    if isinstance(nlu_config, str):
        nlu_config = config.load(nlu_config)

    # Ensure we are training a model that we can save in the end
    # WARN: there is still a race condition if a model with the same name is
    # trained in another subprocess
    trainer = Trainer(nlu_config, component_builder)
    persistor = create_persistor(storage)

    # Training data can come either from an endpoint or from a local path.
    if training_data_endpoint is None:
        training_data = load_data(data, nlu_config.language)
    else:
        training_data = load_data_from_endpoint(training_data_endpoint,
                                                nlu_config.language)

    interpreter = trainer.train(training_data, **kwargs)

    persisted_path = (trainer.persist(path, persistor, project, fixed_model_name)
                      if path else None)
    return trainer, interpreter, persisted_path
def train_nlu(data, configs, model_dir):
    """Train an NLU model from ``data`` using ``configs`` and persist it.

    Returns the directory the trained model was stored in.
    """
    trainer = Trainer(config.load(configs))
    trainer.train(load_data(data))
    model_directory = trainer.persist(model_dir, fixed_model_name="nlu")
    logger.info(f"Model trained. Stored in '{model_directory}'.")
    return model_directory
def test_run_cv_evaluation_with_response_selector():
    """CV with a response selector reports per-fold metrics and no entity results."""
    training_data_obj = training_data.load_data("data/examples/rasa/demo-rasa.md")
    training_data_responses_obj = training_data.load_data(
        "data/examples/rasa/demo-rasa-responses.md"
    )
    training_data_obj = training_data_obj.merge(training_data_responses_obj)
    training_data_obj.fill_response_phrases()
    nlu_config = config.load(
        "sample_configs/config_embedding_intent_response_selector.yml"
    )
    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        training_data_obj, n_folds, nlu_config
    )
    # Same assertions as listing each metric individually, folded into loops.
    for results in (intent_results, response_selection_results):
        for metric in ("Accuracy", "Precision", "F1-score"):
            assert len(results.train[metric]) == n_folds
            assert len(results.test[metric]) == n_folds
    # No entity extractor in pipeline
    assert len(entity_results.train) == 0
    assert len(entity_results.test) == 0
def test_train_docker_and_docs_configs(config_file: Text):
    """Every shipped docker/docs config must load and declare its language."""
    raw_content = io_utils.read_yaml_file(config_file)
    parsed = config.load(config_file)
    assert len(parsed.component_names) > 1
    assert parsed.language == raw_content["language"]
def test_pipeline_looksup_registry():
    """A template name in "pipeline" expands to the registered component list."""
    template_name = next(iter(registered_pipeline_templates))
    config_file = write_file_config({"pipeline": template_name})
    final_config = config.load(config_file.name)
    component_names = [component.get("name") for component in final_config.pipeline]
    assert component_names == registered_pipeline_templates[template_name]
def train_nlu(data_path, configs, model_path):
    """Train an NLU model, persist it, then evaluate it on its own training data."""
    logging.basicConfig(filename=logfile, level=logging.DEBUG)
    trainer = Trainer(config.load(configs))
    trainer.train(load_data(data_path))
    model_directory = trainer.persist(model_path, fixed_model_name='nlu')
    # Evaluation on the training data itself -- a sanity check, not a test set.
    run_evaluation(data_path, model_directory)
def train_nlu():
    """Train the NLU model from ./data/nlu.md and persist it as "current"."""
    nlu_data = load_data('./data/nlu.md')
    trainer = Trainer(config.load("config.yml"))
    trainer.train(nlu_data)
    return trainer.persist('./models/nlu/', fixed_model_name="current")
def test_pipeline_registry_lookup(pipeline_template: Text):
    """Each registered template must expand to exactly its registered pipeline.

    Serializing both sides with sorted keys gives a deep comparison of the
    component dicts that is insensitive to dict key order.
    """
    args = {"pipeline": pipeline_template}
    f = write_file_config(args)
    final_config = config.load(f.name)
    # list() instead of an identity comprehension (ruff PERF402).
    components = list(final_config.pipeline)
    assert json.dumps(components, sort_keys=True) == json.dumps(
        registered_pipeline_templates[pipeline_template], sort_keys=True)
def load_training_data(data_file="../data/testData.json",
                       config_file="../configs/config_spacy.yml"):
    """Train a model from ``data_file`` with ``config_file``.

    :return: path to the persisted model folder
    """
    trainer = Trainer(config.load(config_file))
    trainer.train(load_data(data_file))
    # where model_directory points to the model folder
    model_directory = trainer.persist('./projects/default/')
    return model_directory
def train_nlu(lang="en", production_build=False):
    """Train the NLU model for ``lang``, tagging it "production" or "latest"."""
    model_name = "production" if production_build else "latest"
    training_data = load_data('./data/nlu/' + lang + "/")
    trainer = Trainer(config.load("config.yml"))
    trainer.train(training_data)
    trainer.persist('./models/nlu/' + lang + "/", fixed_model_name=model_name)
def test_train_featurizer():
    """Training with the USE featurizer config must yield a non-empty pipeline."""
    trained, _, _ = train.do_train(
        config.load('sample_configs/sample_use_featurizer.yml'),
        data='data/examples/dialogflow',
        path='models',
        project='current',
        fixed_model_name='use-featurizer')
    assert trained.pipeline
def train(cfg_name, project_name):
    """Train an NLU model and persist it under ``project_name``.

    :param cfg_name: path to the NLU config file
    :param project_name: project directory name passed to ``trainer.persist``
    """
    from rasa.nlu import training_data
    cfg = config.load(cfg_name)
    # NOTE(review): ``component_builder`` and ``data`` are not parameters of
    # this function -- they must exist at an enclosing/module scope (likely
    # test fixtures or globals); verify before reusing this helper.
    trainer = Trainer(cfg, component_builder)
    # Rebinds the imported ``training_data`` module name to the loaded
    # TrainingData object; harmless here since the module is not used again.
    training_data = training_data.load_data(data)
    trainer.train(training_data)
    trainer.persist("test_projects", project_name=project_name)
def train_model(td_file, config_file, model_dir):
    """Train a model from ``td_file`` using ``config_file``.

    :return: path to the persisted model, for later evaluation
    """
    trainer = Trainer(config.load(config_file))
    trainer.train(load_data(td_file))
    return trainer.persist(model_dir)
def test_nlu_interpreter():
    """A freshly trained interpreter must classify 'hello' as the greet intent."""
    nlu_data = load_data("data")
    trainer = Trainer(config.load("config.yml"))
    interpreter = trainer.train(nlu_data)
    model_dir = trainer.persist("./tests/models", project_name="nlu")
    result = interpreter.parse('hello')
    assert result['intent']['name'] == 'greet'
    assert model_dir
async def test_train_docker_and_docs_configs(config_file: Text, monkeypatch: MonkeyPatch):
    """Configs obtained via RasaFileImporter must load with more than one component."""
    # Prevent auto-configuration from writing anything back to disk.
    monkeypatch.setattr(autoconfig, "_dump_config", Mock())
    file_importer = RasaFileImporter(config_file=config_file)
    raw_config = await file_importer.get_config()
    parsed = config.load(raw_config)
    assert len(parsed.component_names) > 1
    assert parsed.language == raw_config["language"]
def load_entity_extractor(data_file, config_file):
    """Train a standalone CRF entity extractor and return its persisted path."""
    training_data = load_data(data_file)
    configuration = config.load(config_file)
    # Built but never used afterwards; kept to preserve existing behavior.
    comp_builder = components.ComponentBuilder()
    crf = CRFEntityExtractor()
    crf.train(training_data, configuration)
    return crf.persist('./models/default/')
def _train_nlu_with_validated_data(
    config: Dict[Text, Text],
    nlu_data_directory: Text,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    retrain_nlu: Union[bool, List[Text]] = True
) -> Optional[Text]:
    """Train NLU with validated training and config data.

    :param config: per-language mapping of language code -> NLU config path
    :param nlu_data_directory: directory holding per-language NLU data files
    :param output: where the packaged model is written when no train_path given
    :param train_path: existing training directory; when None a temporary
        directory is created and cleaned up on exit
    :param fixed_model_name: optional fixed name for the packaged model
    :param retrain_nlu: True/False to retrain everything or nothing, or a list
        of language codes to retrain selectively
    :return: packaged model path, or ``train_path`` when one was provided
    """
    import rasa.nlu.train
    import re

    with ExitStack() as stack:
        models = {}
        from rasa.nlu import config as cfg_loader

        if train_path:
            # If the train path was provided, do nothing on exit.
            _train_path = train_path
        else:
            # Otherwise, create a temp train path and clean it up on exit.
            _train_path = stack.enter_context(TempDirectoryPath(tempfile.mkdtemp()))

        # Two-letter language code immediately before the extension,
        # e.g. "nlu_en.md" -> "en".  NOTE(review): re.search returns None for
        # names that don't match (e.g. no dot) -- .groups() would then raise.
        pattern = r'(\w\w)*(?=\.)'
        for file in os.listdir(nlu_data_directory):
            lang = re.search(pattern, file).groups()[0]
            # BUG FIX: the original condition
            #   isinstance(retrain_nlu, bool) and retrain_nlu or lang in retrain_nlu
            # fell through to `lang in False` (TypeError) when retrain_nlu was
            # the boolean False.  Decide by type first, then by membership.
            should_retrain = (retrain_nlu if isinstance(retrain_nlu, bool)
                              else lang in retrain_nlu)
            if should_retrain:
                nlu_file_path = os.path.join(nlu_data_directory, file)
                print_color("Start training {} NLU model ...".format(lang),
                            color=bcolors.OKBLUE)
                nlu_config = cfg_loader.load(config[lang])
                nlu_config.language = lang
                # NOTE(review): relies on ``rasa.nlu.train`` resolving to a
                # callable attribute of the rasa.nlu package even though
                # ``import rasa.nlu.train`` binds the module -- confirm the
                # package re-exports train().
                _, models[lang], _ = rasa.nlu.train(
                    nlu_config, nlu_file_path, _train_path,
                    fixed_model_name="nlu-{}".format(lang)
                )
            else:
                print_color("{} NLU data didn't change, skipping training...".format(lang),
                            color=bcolors.OKBLUE)
        print_color("NLU model training completed.", color=bcolors.OKBLUE)

        if train_path is None:
            # Only NLU was trained
            new_fingerprint = model.model_fingerprint(
                config, nlu_data=nlu_data_directory
            )
            return _package_model(
                new_fingerprint=new_fingerprint,
                output_path=output,
                train_path=_train_path,
                fixed_model_name=fixed_model_name,
                model_prefix="nlu-",
            )

        return _train_path
def comps(self):
    """Return the component names of the sample tokenizer config.

    $ python -m saai.saai_cli comps
    $ python -m saai comps
    :return: list of pipeline component names
    """
    from rasa.nlu import config
    conf = config.load('saai/sample_configs/config_tokenizer.yml')
    return conf.component_names
def __init__(self):
    """Load the cached NLU interpreter, training a fresh one if loading fails."""
    try:
        self.interpreter = Interpreter.load("./models/nlu/current")
    except Exception:
        # No usable cached model: train from scratch and persist for next time.
        nlu_data = load_data("./data/nlu.md")
        trainer = Trainer(config.load("config.yml"))
        self.interpreter = trainer.train(nlu_data)
        trainer.persist("./models/nlu", fixed_model_name="current")
    self.music_verbs = ['Riproduci', 'Suona', 'Fai partire', 'Avvia']
def train_nlu():
    """Train the NLU model from data/nlu.md and persist it as "current"."""
    from rasa.nlu.training_data import load_data
    from rasa.nlu import config
    from rasa.nlu.model import Trainer

    trainer = Trainer(config.load("config.yml"))
    trainer.train(load_data('data/nlu.md'))
    return trainer.persist('models/nlu/', fixed_model_name="current")
def train(self):
    """Train the NLU pipeline on ``self.data`` and persist the trained model.

    Sets ``self.interpreter`` (for parsing) and ``self.model_directory``
    (where the model was stored).
    """
    # loading the nlu training samples
    training_data = load_data(self.data)
    # trainer to educate our pipeline
    trainer = Trainer(config.load(self.pipeline))
    # train the model
    self.interpreter = trainer.train(training_data)
    # store it for future use
    # NOTE(review): ``persist_nlu_training_data`` normally expects a bool;
    # passing the TrainingData object is truthy but looks unintended -- verify.
    self.model_directory = trainer.persist(
        "opennlu/data/model/rasa",
        fixed_model_name=self.name,
        persist_nlu_training_data=training_data)
def test_override_defaults_supervised_embeddings_pipeline():
    """Component defaults must be overridable from the config file."""
    cfg = config.load("data/test/config_embedding_test.yml")
    builder = ComponentBuilder()
    first_component = builder.create_component(cfg.for_component(0), cfg)
    assert first_component.max_ngram == 3
    second_component = builder.create_component(cfg.for_component(1), cfg)
    assert second_component.epochs == 10
def test_validate_required_components_from_data(
        config_path: Text, data_path: Text, expected_warning_excerpts: List[Text]):
    """A pipeline missing components required by the data must emit exactly one
    UserWarning containing every expected excerpt."""
    loaded_config = config.load(config_path)
    trainer = Trainer(loaded_config)
    training_data = load_data(data_path)
    with pytest.warns(UserWarning) as record:
        components.validate_required_components_from_data(
            trainer.pipeline, training_data)
    assert len(record) == 1
    # BUG FIX: the original wrapped the membership test in a one-element list
    # (`all([excerpt in ...] for ...)`), so every element of the generator was
    # a non-empty (truthy) list and the assertion could never fail.
    assert all(excerpt in record[0].message.args[0]
               for excerpt in expected_warning_excerpts)
def train_test(td_file, config_file, model_dir):
    """Split the data 60/40, train on the train half, evaluate on the test half."""
    td = load_data(td_file)
    trainer = Trainer(config.load(config_file))
    train, test = td.train_test_split(train_frac=0.6)
    trainer.train(train)
    model_loc = trainer.persist(model_dir)
    # Persist both splits so the evaluation (and later inspection) can reuse them.
    with open("data/tmp/temp_test.json", "w", encoding="utf8") as out:
        out.write(test.as_json())
    with open("data/temp_train.json", "w", encoding="utf8") as out:
        out.write(train.as_json())
    evaluate_model("data/tmp/temp_test.json", model_loc)