def test_write_to_and_read(default_model_storage: ModelStorage):
    test_filename = "file.txt"
    test_file_content = "hi"

    test_sub_filename = "sub_file"
    test_sub_dir_name = "sub_directory"
    test_sub_file_content = "sub file"

    resource = Resource("some_node123")

    # Fill model storage for resource
    with default_model_storage.write_to(resource) as resource_directory:
        file = resource_directory / test_filename
        file.write_text(test_file_content)

        sub_directory = resource_directory / test_sub_dir_name
        sub_directory.mkdir()
        file_in_sub_directory = sub_directory / test_sub_filename
        file_in_sub_directory.write_text(test_sub_file_content)

    # Read written resource data from model storage to see whether all expected
    # content is there
    with default_model_storage.read_from(resource) as resource_directory:
        assert (resource_directory / test_filename).read_text() == test_file_content
        assert (
            resource_directory / test_sub_dir_name / test_sub_filename
        ).read_text() == test_sub_file_content
def test_read_from_not_existing_resource(default_model_storage: ModelStorage):
    with default_model_storage.write_to(Resource("resource1")) as temporary_directory:
        file = temporary_directory / "file.txt"
        file.write_text("test")

    with pytest.raises(ValueError):
        with default_model_storage.read_from(Resource("a different resource")) as _:
            pass
def test_create_model_package_with_non_existing_dir(
    tmp_path: Path, default_model_storage: ModelStorage
):
    path = tmp_path / "some_dir" / "another" / "model.tar.gz"

    default_model_storage.create_model_package(
        path,
        GraphModelConfiguration(
            GraphSchema({}), GraphSchema({}), TrainingType.BOTH, None, None, "nlu"
        ),
        Domain.empty(),
    )

    assert path.exists()
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> SklearnIntentClassifier:
    """Loads trained component (see parent class for full docstring)."""
    from sklearn.preprocessing import LabelEncoder

    try:
        with model_storage.read_from(resource) as model_dir:
            file_name = cls.__name__
            classifier_file = model_dir / f"{file_name}_classifier.pkl"

            if classifier_file.exists():
                classifier = io_utils.json_unpickle(classifier_file)

                encoder_file = model_dir / f"{file_name}_encoder.pkl"
                classes = io_utils.json_unpickle(encoder_file)
                encoder = LabelEncoder()
                encoder.classes_ = classes

                return cls(config, model_storage, resource, classifier, encoder)
    except ValueError:
        logger.debug(
            f"Failed to load '{cls.__name__}' from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
    return cls(config, model_storage, resource)
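# A minimal sketch (assumed, not the component's actual persist code) of the
# writing side that the `load` method above expects: the classifier and the
# label encoder's classes are json-pickled under file names derived from the
# class name, inside the component's resource. It assumes `io_utils` refers to
# `rasa.utils.io` and that `io_utils.json_pickle(path, obj)` is the counterpart
# of the `json_unpickle` call used in `load`.
def _persist_sklearn_classifier(
    model_storage: ModelStorage,
    resource: Resource,
    classifier: Any,
    encoder: "LabelEncoder",
) -> None:
    with model_storage.write_to(resource) as model_dir:
        file_name = "SklearnIntentClassifier"
        # persist the trained sklearn classifier
        io_utils.json_pickle(model_dir / f"{file_name}_classifier.pkl", classifier)
        # persist only the encoder's classes; `load` rebuilds the LabelEncoder
        io_utils.json_pickle(model_dir / f"{file_name}_encoder.pkl", encoder.classes_)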
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> KeywordIntentClassifier:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_dir:
            keyword_file = model_dir / f"{cls.__name__}.json"
            intent_keyword_map = rasa.shared.utils.io.read_json_file(keyword_file)
    except ValueError:
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        intent_keyword_map = None

    return cls(
        config,
        model_storage,
        resource,
        execution_context,
        intent_keyword_map,
    )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> MitieIntentClassifier:
    """Loads component for inference (see parent class for full docstring)."""
    import mitie

    text_categorizer = None

    try:
        with model_storage.read_from(resource) as directory:
            text_categorizer = mitie.text_categorizer(str(directory / "model.dat"))
    except (
        ValueError,
        Exception,
    ):  # the latter is thrown by `mitie.text_categorizer`
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )

    return cls(config, model_storage, resource, text_categorizer)
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> RegexEntityExtractorGraphComponent:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_path:
            regex_file = model_path / cls.REGEX_FILE_NAME
            patterns = rasa.shared.utils.io.read_json_file(regex_file)

            return RegexEntityExtractorGraphComponent(
                config,
                model_storage=model_storage,
                resource=resource,
                patterns=patterns,
            )
    except ValueError:
        rasa.shared.utils.io.raise_warning(
            f"Failed to load {cls.__name__} from model storage. "
            f"This can happen if the model could not be trained because regexes "
            f"could not be extracted from the given training data - and hence "
            f"could not be persisted."
        )
        return RegexEntityExtractorGraphComponent(
            config,
            model_storage=model_storage,
            resource=resource,
        )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> LexicalSyntacticFeaturizer:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_path:
            feature_to_idx_dict = rasa.utils.io.json_unpickle(
                model_path / cls.FILENAME_FEATURE_TO_IDX_DICT,
                encode_non_string_keys=True,
            )
            return cls(
                config=config,
                model_storage=model_storage,
                resource=resource,
                execution_context=execution_context,
                feature_to_idx_dict=feature_to_idx_dict,
            )
    except ValueError:
        logger.debug(
            f"Failed to load `{cls.__name__}` from model storage. "
            f"Resource '{resource.name}' doesn't exist."
        )
        return cls(
            config=config,
            model_storage=model_storage,
            resource=resource,
            execution_context=execution_context,
        )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> MemoizationPolicy:
    """Loads a trained policy (see parent class for full docstring)."""
    featurizer = None
    lookup = None

    try:
        with model_storage.read_from(resource) as path:
            metadata_file = Path(path) / cls._metadata_filename()
            metadata = rasa.shared.utils.io.read_json_file(metadata_file)
            lookup = metadata["lookup"]

            if (Path(path) / FEATURIZER_FILE).is_file():
                featurizer = TrackerFeaturizer.load(path)
    except (ValueError, FileNotFoundError, FileIOException):
        logger.warning(
            f"Couldn't load metadata for policy '{cls.__name__}' as the persisted "
            f"metadata couldn't be loaded."
        )

    return cls(
        config,
        model_storage,
        resource,
        execution_context,
        featurizer=featurizer,
        lookup=lookup,
    )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> MitieEntityExtractor:
    """Loads trained component (see parent class for full docstring)."""
    import mitie

    try:
        with model_storage.read_from(resource) as model_path:
            ner_file = model_path / cls.MITIE_RESOURCE_FILE
            if not ner_file.exists():
                raise FileNotFoundError(
                    f"Expected a MITIE extractor file at {ner_file}."
                )

            ner = mitie.named_entity_extractor(str(ner_file))
            return cls(config, model_storage, resource, ner=ner)
    except (FileNotFoundError, ValueError) as e:
        logger.debug(
            f"Failed to load {cls.__name__} from model storage. "
            f"This can happen if the component was not trained - and hence no "
            f"MITIE extractor file could be persisted. Error: {e}."
        )
        return cls(config, model_storage, resource)
def test_loading_from_resource_eager(default_model_storage: ModelStorage):
    previous_resource = Resource("previous resource")
    test_value = {"test": "test value"}

    # Pretend resource persisted itself before
    with default_model_storage.write_to(previous_resource) as directory:
        rasa.shared.utils.io.dump_obj_as_json_to_file(
            directory / "test.json", test_value
        )

    node_name = "some_name"
    node = GraphNode(
        node_name=node_name,
        component_class=PersistableTestComponent,
        constructor_name="load",
        component_config={},
        fn_name="run_inference",
        inputs={},
        eager=True,
        model_storage=default_model_storage,
        # The `GraphComponent` should load from this resource
        resource=previous_resource,
        execution_context=ExecutionContext(GraphSchema({}), "123"),
    )

    actual_node_name, value = node()

    assert actual_node_name == node_name
    assert value == test_value
def test_writing_to_resource_during_training(default_model_storage: ModelStorage):
    node_name = "some_name"

    test_value_for_sub_directory = {"test": "test value sub dir"}
    test_value = {"test dir": "test value dir"}

    node = GraphNode(
        node_name=node_name,
        component_class=PersistableTestComponent,
        constructor_name="create",
        component_config={
            "test_value": test_value,
            "test_value_for_sub_directory": test_value_for_sub_directory,
        },
        fn_name="train",
        inputs={},
        eager=False,
        model_storage=default_model_storage,
        resource=None,
        execution_context=ExecutionContext(GraphSchema({}), "123"),
    )

    _, resource = node()
    assert resource == Resource(node_name)

    with default_model_storage.read_from(resource) as directory:
        assert (
            rasa.shared.utils.io.read_json_file(directory / "test.json") == test_value
        )
        assert (
            rasa.shared.utils.io.read_json_file(directory / "sub_dir" / "test.json")
            == test_value_for_sub_directory
        )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> ResponseSelector:
    """Loads the trained model from the provided directory."""
    model: ResponseSelector = super().load(
        config, model_storage, resource, execution_context, **kwargs
    )
    try:
        with model_storage.read_from(resource) as model_path:
            file_name = cls.__name__
            responses = rasa.shared.utils.io.read_json_file(
                model_path / f"{file_name}.responses.json"
            )
            all_retrieval_intents = rasa.shared.utils.io.read_json_file(
                model_path / f"{file_name}.retrieval_intents.json"
            )
            model.responses = responses
            model.all_retrieval_intents = all_retrieval_intents
            return model
    except ValueError:
        logger.debug(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        return cls(config, model_storage, resource, execution_context)
def from_cache(
    cls, node_name: Text, directory: Path, model_storage: ModelStorage
) -> Resource:
    """Loads a `Resource` from the cache.

    This automatically loads the persisted resource into the given `ModelStorage`.

    Args:
        node_name: The node name of the `Resource`.
        directory: The directory with the cached `Resource`.
        model_storage: The `ModelStorage` which the cached `Resource` will be added
            to so that the `Resource` is accessible for other graph nodes.

    Returns:
        The ready-to-use and accessible `Resource`.
    """
    logger.debug(f"Loading resource '{node_name}' from cache.")

    resource = Resource(node_name)
    with model_storage.write_to(resource) as resource_directory:
        rasa.utils.common.copy_directory(directory, resource_directory)

    logger.debug(f"Successfully initialized resource '{node_name}' from cache.")

    return resource
async def test_train_model_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet(
        {EPOCHS: 2, EVAL_NUM_EPOCHS: 1, EVAL_NUM_EXAMPLES: 10, CHECKPOINT_MODEL: True}
    )

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert checkpoint_dir.is_dir()

        # Tricky to validate the *exact* number of files that should be there,
        # however there must be at least the following:
        # - metadata.json
        # - checkpoint
        # - component_1_CountVectorsFeaturizer (as per the pipeline above)
        # - component_2_DIETClassifier files (more than 1 file)
        all_files = list(model_dir.rglob("*.*"))
        assert len(all_files) > 4
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> EntitySynonymMapperComponent:
    """Loads trained component (see parent class for full docstring)."""
    synonyms = None
    try:
        with model_storage.read_from(resource) as storage:
            entity_synonyms_file = (
                storage / EntitySynonymMapperComponent.SYNONYM_FILENAME
            )

            if os.path.isfile(entity_synonyms_file):
                synonyms = rasa.shared.utils.io.read_json_file(entity_synonyms_file)
            else:
                synonyms = None
                rasa.shared.utils.io.raise_warning(
                    f"Failed to load synonyms file from '{entity_synonyms_file}'.",
                    docs=DOCS_URL_TRAINING_DATA + "#synonyms",
                )
    except ValueError:
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
    return cls(config, model_storage, resource, synonyms)
def test_resource_with_model_storage(
    default_model_storage: ModelStorage, tmp_path: Path, temp_cache: TrainingCache
):
    node_name = "some node"
    resource = Resource(node_name)
    test_filename = "persisted_model.json"
    test_content = {"epochs": 500}

    with default_model_storage.write_to(resource) as temporary_directory:
        rasa.shared.utils.io.dump_obj_as_json_to_file(
            temporary_directory / test_filename, test_content
        )

    test_fingerprint_key = uuid.uuid4().hex
    test_output_fingerprint_key = uuid.uuid4().hex
    temp_cache.cache_output(
        test_fingerprint_key,
        resource,
        test_output_fingerprint_key,
        default_model_storage,
    )

    new_model_storage_location = tmp_path / "new_model_storage"
    new_model_storage_location.mkdir()
    new_model_storage = LocalModelStorage(new_model_storage_location)

    restored_resource = temp_cache.get_cached_result(
        test_output_fingerprint_key, node_name, new_model_storage
    )

    assert isinstance(restored_resource, Resource)
    assert restored_resource == resource

    with new_model_storage.read_from(restored_resource) as temporary_directory:
        cached_content = rasa.shared.utils.io.read_json_file(
            temporary_directory / test_filename
        )
        assert cached_content == test_content
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> GraphComponent:
    """Loads a `FineTuningValidator` (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as path:
            fingerprints = rasa.shared.utils.io.read_json_file(
                filename=path / cls.FILENAME
            )
            return cls(
                config=config,
                model_storage=model_storage,
                execution_context=execution_context,
                resource=resource,
                fingerprints=fingerprints,
            )
    except ValueError as e:
        raise InvalidConfigException(
            f"Loading {cls.__name__} failed. Ensure that the {cls.__name__} "
            f"is part of your training graph and re-train your models before "
            f"attempting to use the {cls.__name__}."
        ) from e
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> Policy:
    """Loads a trained policy (see parent class for full docstring)."""
    featurizer = None

    try:
        with model_storage.read_from(resource) as path:
            if (Path(path) / FEATURIZER_FILE).is_file():
                featurizer = TrackerFeaturizer.load(path)

            config.update(kwargs)
    except (ValueError, FileNotFoundError, FileIOException):
        logger.debug(
            f"Couldn't load metadata for policy '{cls.__name__}' as the persisted "
            f"metadata couldn't be loaded."
        )

    return cls(
        config,
        model_storage,
        resource,
        execution_context,
        featurizer=featurizer,
    )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> RegexFeaturizer:
    """Loads trained component (see parent class for full docstring)."""
    known_patterns = None

    try:
        with model_storage.read_from(resource) as model_dir:
            patterns_file_name = model_dir / "patterns.pkl"
            known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
    except (ValueError, FileNotFoundError):
        logger.warning(
            f"Failed to load `{cls.__name__}` from model storage. "
            f"Resource '{resource.name}' doesn't exist."
        )

    return cls(
        config,
        model_storage,
        resource,
        execution_context,
        known_patterns=known_patterns,
    )
async def test_doesnt_checkpoint_with_zero_eval_num_examples(
    create_diet: Callable[..., DIETClassifierGraphComponent],
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    train_load_and_process_diet: Callable[..., Message],
):
    with pytest.warns(UserWarning) as warning:
        classifier = create_diet(
            {
                EPOCHS: 2,
                CHECKPOINT_MODEL: True,
                EVAL_NUM_EXAMPLES: 0,
                EVAL_NUM_EPOCHS: 1,
            }
        )

    warn_text = (
        f"You have opted to save the best model, but the value of "
        f"'{EVAL_NUM_EXAMPLES}' is not greater than 0. No checkpoint model "
        f"will be saved."
    )
    assert len([w for w in warning if warn_text in str(w.message)]) == 1

    train_load_and_process_diet(classifier)

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert not checkpoint_dir.is_dir()
def test_domain_provider_provides_and_persists_domain(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    config_path: Text,
    domain_path: Text,
    domain: Domain,
):
    resource = Resource("xy")
    component = DomainProvider.create(
        DomainProvider.get_default_config(),
        default_model_storage,
        resource,
        default_execution_context,
    )
    assert isinstance(component, DomainProvider)

    importer = TrainingDataImporter.load_from_config(config_path, domain_path)
    training_domain = component.provide_train(importer)

    assert isinstance(training_domain, Domain)
    assert domain.fingerprint() == training_domain.fingerprint()

    with default_model_storage.read_from(resource) as d:
        match = list(d.glob("**/domain.yml"))
        assert len(match) == 1
        assert match[0].is_file()
        assert domain.fingerprint() == Domain.from_path(match[0]).fingerprint()

    component_2 = DomainProvider.load(
        {}, default_model_storage, resource, default_execution_context
    )
    inference_domain = component_2.provide_inference()

    assert isinstance(inference_domain, Domain)
    assert domain.fingerprint() == inference_domain.fingerprint()
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> CRFEntityExtractorGraphComponent:
    """Loads trained component (see parent class for full docstring)."""
    import joblib

    try:
        entity_taggers = {}

        with model_storage.read_from(resource) as model_dir:
            file_names = list(model_dir.glob("**/*.pkl"))
            if not file_names:
                logger.debug(
                    "Failed to load model for 'CRFEntityExtractor'. "
                    "Maybe you did not provide enough training data and "
                    "no model was trained."
                )
                return cls(config, model_storage, resource)

            for file_name in file_names:
                name = file_name.stem
                entity_taggers[name] = joblib.load(file_name)

            return cls(config, model_storage, resource, entity_taggers)
    except ValueError:
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        return cls(config, model_storage, resource)
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> KeywordIntentClassifierGraphComponent:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_dir:
            keyword_file = list(model_dir.glob("**/*.json"))
            if keyword_file:
                assert len(keyword_file) == 1
                intent_keyword_map = rasa.shared.utils.io.read_json_file(
                    keyword_file[0]
                )
            else:
                rasa.shared.utils.io.raise_warning(
                    f"Failed to load keyword file for "
                    f"`KeywordIntentClassifierGraphComponent`, "
                    f"maybe {keyword_file} does not exist?"
                )
                intent_keyword_map = None
    except ValueError:
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        intent_keyword_map = None

    return cls(
        config,
        model_storage,
        resource,
        execution_context,
        intent_keyword_map,
    )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> CountVectorsFeaturizer:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_dir:
            featurizer_file = model_dir / "vocabularies.pkl"
            vocabulary = io_utils.json_unpickle(featurizer_file)

            share_vocabulary = config["use_shared_vocab"]

            if share_vocabulary:
                vectorizers = cls._create_shared_vocab_vectorizers(
                    config, vocabulary=vocabulary
                )
            else:
                vectorizers = cls._create_independent_vocab_vectorizers(
                    config, vocabulary=vocabulary
                )

            oov_words = rasa.shared.utils.io.read_json_file(
                model_dir / "oov_words.json"
            )

            ftr = cls(
                config,
                model_storage,
                resource,
                execution_context,
                vectorizers=vectorizers,
                oov_token=config["OOV_token"],
                oov_words=oov_words,
            )

            # make sure the vocabulary has been loaded correctly
            for attribute in vectorizers:
                ftr.vectorizers[attribute]._validate_vocabulary()

            return ftr
    except (ValueError, FileNotFoundError, FileIOException):
        logger.debug(
            f"Failed to load `{cls.__name__}` from model storage. "
            f"Resource '{resource.name}' doesn't exist."
        )
        return cls(
            config=config,
            model_storage=model_storage,
            resource=resource,
            execution_context=execution_context,
        )
def to_cache(self, directory: Path, model_storage: ModelStorage) -> None:
    """Persists the `Resource` to the cache.

    Args:
        directory: The directory which receives the persisted `Resource`.
        model_storage: The model storage which currently contains the persisted
            `Resource`.
    """
    with model_storage.read_from(self) as resource_directory:
        rasa.utils.common.copy_directory(resource_directory, directory)
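# A minimal sketch (not library code) of how `to_cache` and `from_cache` fit
# together: a resource is exported from one `ModelStorage` into a cache
# directory and later re-imported into a fresh `ModelStorage`, mirroring the
# round trip exercised by `test_resource_with_model_storage` above. The helper
# name and the "some node" resource name are illustrative only.
def _example_cache_round_trip(
    old_storage: ModelStorage, new_storage: ModelStorage, cache_dir: Path
) -> Resource:
    resource = Resource("some node")

    # export: copy the resource's files out of the old storage into the cache
    cache_dir.mkdir(parents=True, exist_ok=True)
    resource.to_cache(cache_dir, old_storage)

    # import: copy the cached files into the new storage under the same name
    return Resource.from_cache(resource.name, cache_dir, new_storage)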
async def test_train_model_not_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet({EPOCHS: 1, CHECKPOINT_MODEL: False})

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        all_files = list(model_dir.rglob("*.*"))
        assert not any("from_checkpoint" in str(filename) for filename in all_files)
async def test_train_model_not_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet({EPOCHS: 2, CHECKPOINT_MODEL: False})

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert not checkpoint_dir.is_dir()
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> DomainProvider:
    """Creates provider using a persisted version of itself."""
    with model_storage.read_from(resource) as resource_directory:
        domain = Domain.from_path(resource_directory)
    return cls(model_storage, resource, domain)
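# A minimal sketch of the persisting side that `DomainProvider.load` above
# relies on: the domain is written into the resource directory so that
# `Domain.from_path` (and the "**/domain.yml" glob in the domain provider test)
# can find it again. The helper name `_persist_domain` is illustrative, and it
# assumes `Domain.persist` serializes the domain to the given YAML file.
def _persist_domain(
    model_storage: ModelStorage, resource: Resource, domain: Domain
) -> None:
    with model_storage.write_to(resource) as resource_directory:
        domain.persist(resource_directory / "domain.yml")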
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> PersistableTestComponent:
    assert model_storage
    assert resource

    with model_storage.read_from(resource) as directory:
        eager_instantiated_value = rasa.shared.utils.io.read_json_file(
            directory / "test.json"
        )
    return cls(config, model_storage, resource, eager_instantiated_value)
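# A minimal sketch (assumed, not the actual test fixture) of the persisting
# side that `PersistableTestComponent.load` above reads back: during training
# the component writes `test.json` (and `sub_dir/test.json`) into its resource
# via `model_storage.write_to`, matching the files checked in
# `test_writing_to_resource_during_training`. The helper name is illustrative.
def _persist_test_values(
    model_storage: ModelStorage,
    resource: Resource,
    test_value: Dict[Text, Any],
    test_value_for_sub_directory: Dict[Text, Any],
) -> None:
    with model_storage.write_to(resource) as directory:
        # top-level value read by `load` as `eager_instantiated_value`
        rasa.shared.utils.io.dump_obj_as_json_to_file(
            directory / "test.json", test_value
        )
        # nested value checked by the training test
        sub_dir = directory / "sub_dir"
        sub_dir.mkdir()
        rasa.shared.utils.io.dump_obj_as_json_to_file(
            sub_dir / "test.json", test_value_for_sub_directory
        )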