def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                       config_path=None, exclude_slot_metrics=False,
                       include_errors=False, verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        train_dataset=train_dataset_path,
        test_dataset=test_dataset_path,
        engine_class=engine_cls,
        include_slot_metrics=not exclude_slot_metrics)

    from snips_nlu_metrics import compute_train_test_metrics
    metrics = compute_train_test_metrics(**metrics_args)

    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
def persist(self, path): """Persists the object at the given path""" path.mkdir() crf_model_file = None if self.crf_model is not None: crf_model_file = CRF_MODEL_FILENAME destination = path / crf_model_file shutil.copy(self.crf_model.modelfile.name, str(destination)) # On windows, permissions of crfsuite files are correct if os.name == "posix": umask = os.umask(0o022) # retrieve the system umask os.umask(umask) # restore the sys umask to its original value os.chmod(str(destination), 0o644 & ~umask) model = { "language_code": self.language, "intent": self.intent, "crf_model_file": crf_model_file, "slot_name_mapping": self.slot_name_mapping, "config": self.config.to_dict(), } model_json = json_string(model) model_path = path / "slot_filler.json" with model_path.open(mode="w", encoding="utf8") as f: f.write(model_json) self.persist_metadata(path)
def setUp(self):
    super(TestCLI, self).setUp()
    if not self.fixture_dir.exists():
        self.fixture_dir.mkdir()

    dataset_stream = io.StringIO(u"""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups
- i want [number_of_cups] cups of [beverage_temperature](boiling hot) tea pls
- can you prepare [number_of_cups] cup of [beverage_temperature](cold) tea ?

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee
- can you prepare [number_of_cups] cup of coffee""")
    beverage_dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    self.beverage_dataset_path = self.fixture_dir / "beverage_dataset.json"
    if self.beverage_dataset_path.exists():
        self.beverage_dataset_path.unlink()
    with self.beverage_dataset_path.open(mode="w") as f:
        f.write(json_string(beverage_dataset))

    self.tmp_file_path = self.fixture_dir / next(
        tempfile._get_candidate_names())
    while self.tmp_file_path.exists():
        self.tmp_file_path = self.fixture_dir / next(
            tempfile._get_candidate_names())
def _build_gazetteer_parser(target_dir, gazetteer_entities, language):
    from snips_nlu_parsers import get_builtin_entity_shortname

    gazetteer_parser_name = "gazetteer_entity_parser"
    gazetteer_parser_path = target_dir / gazetteer_parser_name
    gazetteer_parser_metadata = []
    for ent in sorted(gazetteer_entities):
        # Fetch the compiled parser in the resources
        source_parser_path = find_gazetteer_entity_data_path(language, ent)
        short_name = get_builtin_entity_shortname(ent).lower()
        target_parser_path = gazetteer_parser_path / short_name
        parser_metadata = {
            "entity_identifier": ent,
            "entity_parser": short_name
        }
        gazetteer_parser_metadata.append(parser_metadata)
        # Copy the single entity parser
        shutil.copytree(str(source_parser_path), str(target_parser_path))

    # Dump the parser metadata
    gazetteer_entity_parser_metadata = {
        "parsers_metadata": gazetteer_parser_metadata
    }
    gazetteer_parser_metadata_path = gazetteer_parser_path / "metadata.json"
    with gazetteer_parser_metadata_path.open("w", encoding="utf-8") as f:
        f.write(json_string(gazetteer_entity_parser_metadata))
    return gazetteer_parser_name
def test_should_be_deserializable(self, mocked_cooccurrence_load,
                                  mocked_tfidf_load):
    # Given
    mocked_tfidf_load.return_value = "tfidf_vectorizer"
    mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

    language = LANGUAGE_EN
    config = FeaturizerConfig()

    featurizer_dict = {
        "language_code": language,
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }

    self.tmp_file_path.mkdir()
    featurizer_path = self.tmp_file_path / "featurizer.json"
    with featurizer_path.open("w", encoding="utf-8") as f:
        f.write(json_string(featurizer_dict))

    # When
    featurizer = Featurizer.from_path(self.tmp_file_path)

    # Then
    self.assertEqual(language, featurizer.language)
    self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
    self.assertEqual("cooccurrence_vectorizer",
                     featurizer.cooccurrence_vectorizer)
    self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
def persist(self, path):
    path.mkdir()

    # Persist the vectorizers
    tfidf_vectorizer = None
    if self.tfidf_vectorizer:
        tfidf_vectorizer = self.tfidf_vectorizer.unit_name
        tfidf_vectorizer_path = path / tfidf_vectorizer
        self.tfidf_vectorizer.persist(tfidf_vectorizer_path)

    cooccurrence_vectorizer = None
    if self.cooccurrence_vectorizer:
        cooccurrence_vectorizer = self.cooccurrence_vectorizer.unit_name
        cooccurrence_vectorizer_path = path / cooccurrence_vectorizer
        self.cooccurrence_vectorizer.persist(cooccurrence_vectorizer_path)

    # Persist main object
    self_as_dict = {
        "language_code": self.language,
        "tfidf_vectorizer": tfidf_vectorizer,
        "cooccurrence_vectorizer": cooccurrence_vectorizer,
        "config": self.config.to_dict()
    }

    featurizer_path = path / "featurizer.json"
    with featurizer_path.open("w", encoding="utf-8") as f:
        f.write(json_string(self_as_dict))

    # Persist metadata
    self.persist_metadata(path)
def persist(self, path): """Persists the object at the given path""" path.mkdir() featurizer = None if self.featurizer is not None: featurizer = "featurizer" featurizer_path = path / featurizer self.featurizer.persist(featurizer_path) coeffs = None intercept = None t_ = None if self.classifier is not None: coeffs = self.classifier.coef_.tolist() intercept = self.classifier.intercept_.tolist() t_ = self.classifier.t_ self_as_dict = { "config": self.config.to_dict(), "coeffs": coeffs, "intercept": intercept, "t_": t_, "intent_list": self.intent_list, "featurizer": featurizer } classifier_json = json_string(self_as_dict) with (path / "intent_classifier.json").open(mode="w") as f: f.write(classifier_json) self.persist_metadata(path)
def persist(self, path):
    path.mkdir()

    vectorizer_ = None
    if self._tfidf_vectorizer is not None:
        vocab = {k: int(v) for k, v in iteritems(self.vocabulary)}
        idf_diag = self.idf_diag.tolist()
        vectorizer_ = {
            "vocab": vocab,
            "idf_diag": idf_diag
        }

    builtin_entity_scope = None
    if self.builtin_entity_scope is not None:
        builtin_entity_scope = list(self.builtin_entity_scope)

    self_as_dict = {
        "vectorizer": vectorizer_,
        "language_code": self.language,
        "builtin_entity_scope": builtin_entity_scope,
        "config": self.config.to_dict(),
    }

    vectorizer_path = path / "vectorizer.json"
    with vectorizer_path.open("w", encoding="utf-8") as f:
        f.write(json_string(self_as_dict))
    self.persist_metadata(path)
def generate_dataset(language, *files):
    """Creates a Snips NLU dataset from text-friendly files"""
    language = unicode_string(language)
    if any(f.endswith(".yml") or f.endswith(".yaml") for f in files):
        dataset = Dataset.from_yaml_files(language, list(files))
    else:
        dataset = Dataset.from_files(language, list(files))
    print(json_string(dataset.json, indent=2, sort_keys=True))
def persist(self, path):
    model = {
        "language": self.language,
        "slots_keywords": self.slots_keywords,
        "config": self.config.to_dict()
    }
    with path.open(mode="w") as f:
        f.write(json_string(model))
def print_parsing_result(engine, query, intents_filter):
    from snips_nlu.common.utils import unicode_string, json_string

    query = unicode_string(query)
    json_dump = json_string(engine.parse(query, intents_filter),
                            sort_keys=True, indent=2)
    print(json_dump)
def persist(self, path): """Persists the object at the given path""" path.mkdir() parser_json = json_string(self.to_dict()) parser_path = path / "intent_parser.json" with parser_path.open(mode="w") as f: f.write(parser_json) self.persist_metadata(path)
def persist(self, path):
    path = Path(path)
    path.mkdir()

    parser_directory = "parser"
    metadata = {
        "language": self.language,
        "parser_usage": self.parser_usage.value,
        "parser_directory": parser_directory
    }
    with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
        f.write(json_string(metadata))
    self._parser.persist(path / parser_directory)
def persist_resources(resources, resources_dest_path, required_resources):
    if not required_resources:
        return

    resources_dest_path.mkdir()
    metadata = deepcopy(resources[METADATA])

    # Update metadata and keep only required resources
    if not required_resources.get(NOISE, False):
        metadata[NOISE] = None
    if not required_resources.get(STOP_WORDS, False):
        metadata[STOP_WORDS] = None
    if not required_resources.get(STEMS, False):
        metadata[STEMS] = None
    metadata[GAZETTEERS] = sorted(required_resources.get(GAZETTEERS, []))
    metadata[WORD_CLUSTERS] = sorted(
        required_resources.get(WORD_CLUSTERS, []))

    metadata_dest_path = resources_dest_path / "metadata.json"
    metadata_json = json_string(metadata)
    with metadata_dest_path.open(encoding="utf8", mode="w") as f:
        f.write(metadata_json)

    if metadata[NOISE] is not None:
        noise_path = (resources_dest_path / metadata[NOISE]) \
            .with_suffix(".txt")
        _persist_noise(get_noise(resources), noise_path)

    if metadata[STOP_WORDS] is not None:
        stop_words_path = (resources_dest_path / metadata[STOP_WORDS]) \
            .with_suffix(".txt")
        _persist_stop_words(get_stop_words(resources), stop_words_path)

    if metadata[STEMS] is not None:
        stemming_dir = resources_dest_path / "stemming"
        stemming_dir.mkdir()
        stems_path = (stemming_dir / metadata[STEMS]).with_suffix(".txt")
        _persist_stems(get_stems(resources), stems_path)

    if metadata[GAZETTEERS]:
        gazetteers_dir = resources_dest_path / "gazetteers"
        gazetteers_dir.mkdir()
        for name in metadata[GAZETTEERS]:
            gazetteer_path = (gazetteers_dir / name).with_suffix(".txt")
            _persist_gazetteer(get_gazetteer(resources, name),
                               gazetteer_path)

    if metadata[WORD_CLUSTERS]:
        clusters_dir = resources_dest_path / "word_clusters"
        clusters_dir.mkdir()
        for name in metadata[WORD_CLUSTERS]:
            clusters_path = (clusters_dir / name).with_suffix(".txt")
            _persist_word_clusters(get_word_cluster(resources, name),
                                   clusters_path)
def generate_dataset(language, *yaml_files):
    """Creates a Snips NLU dataset from YAML definition files

    Check :meth:`.Intent.from_yaml` and :meth:`.Entity.from_yaml` for the
    format of the YAML files.

    Args:
        language (str): language of the dataset (iso code)
        *yaml_files: list of intent and entity definition files in YAML
            format.

    Returns:
        None. The json dataset output is printed out on stdout.
    """
    language = unicode_string(language)
    dataset = Dataset.from_yaml_files(language, list(yaml_files))
    print(json_string(dataset.json, indent=2, sort_keys=True))
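# A minimal usage sketch for generate_dataset (the YAML file name below is
# hypothetical): the JSON dataset is printed on stdout and is typically
# redirected to a file, e.g. with the command line:
# snips-nlu generate-dataset en dataset.yaml > dataset.json
def _demo_generate_dataset():
    generate_dataset("en", "dataset.yaml")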
def _build_builtin_parser(language, gazetteer_entities):
    with temp_dir() as serialization_dir:
        gazetteer_entity_parser = None
        if gazetteer_entities:
            gazetteer_entity_parser = _build_gazetteer_parser(
                serialization_dir, gazetteer_entities, language)

        metadata = {
            "language": language.upper(),
            "gazetteer_parser": gazetteer_entity_parser
        }
        metadata_path = serialization_dir / "metadata.json"
        with metadata_path.open("w", encoding="utf-8") as f:
            f.write(json_string(metadata))
        parser = _BuiltinEntityParser.from_path(serialization_dir)
        return BuiltinEntityParser(parser)
def cross_val_metrics(dataset_path, output_path, config_path=None,
                      nb_folds=5, train_size_ratio=1.0,
                      exclude_slot_metrics=False, include_errors=False,
                      verbose=0):
    import json
    import logging
    from pathlib import Path
    from snips_nlu_metrics import compute_cross_val_metrics
    from snips_nlu import SnipsNLUEngine
    from snips_nlu.cli.utils import set_nlu_logger
    from snips_nlu.common.utils import json_string

    if verbose == 1:
        set_nlu_logger(logging.INFO)
    elif verbose >= 2:
        set_nlu_logger(logging.DEBUG)

    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=engine_cls,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
        slot_matching_lambda=_match_trimmed_values)

    metrics = compute_cross_val_metrics(**metrics_args)

    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
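# A minimal usage sketch for cross_val_metrics (file names are hypothetical):
# runs 5-fold cross-validation on a JSON dataset and writes the metrics
# report next to it, with INFO-level logging enabled.
def _demo_cross_val_metrics():
    cross_val_metrics("dataset.json", "metrics.json", nb_folds=5,
                      train_size_ratio=1.0, verbose=1)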
def persist(self, path):
    path.mkdir()

    builtin_entity_scope = None
    if self.builtin_entity_scope is not None:
        builtin_entity_scope = list(self.builtin_entity_scope)

    self_as_dict = {
        "language_code": self.language,
        "word_pairs": {
            i: list(p) for p, i in iteritems(self.word_pairs)
        },
        "builtin_entity_scope": builtin_entity_scope,
        "config": self.config.to_dict()
    }
    vectorizer_json = json_string(self_as_dict)
    vectorizer_path = path / "vectorizer.json"
    with vectorizer_path.open(mode="w") as f:
        f.write(vectorizer_json)
    self.persist_metadata(path)
def persist(self, path): """Persists the object at the given path""" path.mkdir() crf_model_file = None if self.crf_model is not None: destination = path / Path(self.crf_model.modelfile.name).name shutil.copy(self.crf_model.modelfile.name, str(destination)) crf_model_file = str(destination.name) model = { "language_code": self.language, "intent": self.intent, "crf_model_file": crf_model_file, "slot_name_mapping": self.slot_name_mapping, "config": self.config.to_dict(), } model_json = json_string(model) model_path = path / "slot_filler.json" with model_path.open(mode="w") as f: f.write(model_json) self.persist_metadata(path)
def persist(self, path): """Persists the object at the given path""" path.mkdir() sorted_slot_fillers = sorted(iteritems(self.slot_fillers)) slot_fillers = [] for i, (intent, slot_filler) in enumerate(sorted_slot_fillers): slot_filler_name = "slot_filler_%s" % i slot_filler.persist(path / slot_filler_name) slot_fillers.append({ "intent": intent, "slot_filler_name": slot_filler_name }) if self.intent_classifier is not None: self.intent_classifier.persist(path / "intent_classifier") model = {"config": self.config.to_dict(), "slot_fillers": slot_fillers} model_json = json_string(model) model_path = path / "intent_parser.json" with model_path.open(mode="w") as f: f.write(model_json) self.persist_metadata(path)
def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                       config_path=None, exclude_slot_metrics=False,
                       include_errors=False, verbosity=0):
    import json
    import logging
    from pathlib import Path
    from snips_nlu_metrics import compute_train_test_metrics
    from snips_nlu import SnipsNLUEngine
    from snips_nlu.cli.utils import set_nlu_logger
    from snips_nlu.common.utils import json_string

    if verbosity == 1:
        set_nlu_logger(logging.INFO)
    elif verbosity >= 2:
        set_nlu_logger(logging.DEBUG)

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        train_dataset=train_dataset_path,
        test_dataset=test_dataset_path,
        engine_class=engine_cls,
        include_slot_metrics=not exclude_slot_metrics,
        slot_matching_lambda=_match_trimmed_values)

    metrics = compute_train_test_metrics(**metrics_args)

    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
def cross_val_metrics(dataset_path, output_path, config_path=None,
                      nb_folds=5, train_size_ratio=1.0,
                      exclude_slot_metrics=False, include_errors=False,
                      verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=engine_cls,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
    )

    from snips_nlu_metrics import compute_cross_val_metrics
    metrics = compute_cross_val_metrics(**metrics_args)

    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
def persist(self, path): """Persists the NLU engine at the given directory path Args: path (str or pathlib.Path): the location at which the nlu engine must be persisted. This path must not exist when calling this function. Raises: PersistingError: when persisting to a path which already exists """ path.mkdir() parsers_count = defaultdict(int) intent_parsers = [] for parser in self.intent_parsers: parser_name = parser.unit_name parsers_count[parser_name] += 1 count = parsers_count[parser_name] if count > 1: parser_name = "{n}_{c}".format(n=parser_name, c=count) parser_path = path / parser_name parser.persist(parser_path) intent_parsers.append(parser_name) config = None if self.config is not None: config = self.config.to_dict() builtin_entity_parser = None if self.builtin_entity_parser is not None: builtin_entity_parser = "builtin_entity_parser" builtin_entity_parser_path = path / builtin_entity_parser self.builtin_entity_parser.persist(builtin_entity_parser_path) custom_entity_parser = None if self.custom_entity_parser is not None: custom_entity_parser = "custom_entity_parser" custom_entity_parser_path = path / custom_entity_parser self.custom_entity_parser.persist(custom_entity_parser_path) model = { "unit_name": self.unit_name, "dataset_metadata": self.dataset_metadata, "intent_parsers": intent_parsers, "custom_entity_parser": custom_entity_parser, "builtin_entity_parser": builtin_entity_parser, "config": config, "model_version": __model_version__, "training_package_version": __version__ } model_json = json_string(model) model_path = path / "nlu_engine.json" with model_path.open(mode="w") as f: f.write(model_json) if self.fitted: required_resources = self.config.get_required_resources() language = self.dataset_metadata["language_code"] resources_path = path / "resources" resources_path.mkdir() persist_resources(self.resources, resources_path / language, required_resources)
def writeJsonContent(path, json_dict):
    json_content = json_string(json_dict)
    with path.open(mode="w", encoding="utf8") as f:
        f.write(json_content)
def persist(self, path): with path.open("r", encoding="utf-8") as f: f.write(json_string(self.entities))
def persist(self, path):
    path = Path(path)
    path.mkdir()
    with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
        unit_dict = {"unit_name": self.unit_name, "fitted": self.fitted}
        f.write(json_string(unit_dict))
def persist_metadata(self, path, **kwargs):
    metadata = {"unit_name": self.unit_name}
    metadata.update(kwargs)
    metadata_json = json_string(metadata)
    with (path / "metadata.json").open(mode="w") as f:
        f.write(metadata_json)
def print_parsing_result(engine, query):
    query = unicode_string(query)
    json_dump = json_string(engine.parse(query), sort_keys=True, indent=2)
    print(json_dump)