def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> SklearnIntentClassifier:
    """Loads trained component (see parent class for full docstring)."""
    from sklearn.preprocessing import LabelEncoder

    try:
        with model_storage.read_from(resource) as model_dir:
            file_name = cls.__name__
            classifier_file = model_dir / f"{file_name}_classifier.pkl"

            if classifier_file.exists():
                classifier = io_utils.json_unpickle(classifier_file)

                encoder_file = model_dir / f"{file_name}_encoder.pkl"
                classes = io_utils.json_unpickle(encoder_file)
                encoder = LabelEncoder()
                encoder.classes_ = classes

                return cls(config, model_storage, resource, classifier, encoder)
    except ValueError:
        logger.debug(
            f"Failed to load '{cls.__name__}' from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )

    return cls(config, model_storage, resource)
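# For context, a minimal sketch of the matching persist step that would produce
# the files read by `load` above. The attribute names `self.clf` and `self.le`
# are assumptions (they aren't shown in the snippet); `json_pickle` is the
# inverse of the `json_unpickle` calls used during loading.
def persist(self) -> None:
    """Persists the classifier and label-encoder classes to model storage."""
    with self._model_storage.write_to(self._resource) as model_dir:
        file_name = self.__class__.__name__
        if self.clf is not None and self.le is not None:
            # Assumed attributes: `self.clf` (trained sklearn classifier) and
            # `self.le` (fitted `LabelEncoder`).
            io_utils.json_pickle(model_dir / f"{file_name}_classifier.pkl", self.clf)
            io_utils.json_pickle(
                model_dir / f"{file_name}_encoder.pkl", self.le.classes_
            )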
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> MitieIntentClassifierGraphComponent:
    """Loads component for inference (see parent class for full docstring)."""
    import mitie

    text_categorizer = None

    try:
        with model_storage.read_from(resource) as directory:
            text_categorizer = mitie.text_categorizer(str(directory / "model.dat"))
    except Exception:
        # `mitie.text_categorizer` throws a plain `Exception` on failure; this
        # also covers the `ValueError` raised by `read_from` when the resource
        # doesn't exist.
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )

    return cls(config, model_storage, resource, text_categorizer)
def test_writing_to_resource_during_training(default_model_storage: ModelStorage):
    node_name = "some_name"

    test_value_for_sub_directory = {"test": "test value sub dir"}
    test_value = {"test dir": "test value dir"}

    node = GraphNode(
        node_name=node_name,
        component_class=PersistableTestComponent,
        constructor_name="create",
        component_config={
            "test_value": test_value,
            "test_value_for_sub_directory": test_value_for_sub_directory,
        },
        fn_name="train",
        inputs={},
        eager=False,
        model_storage=default_model_storage,
        resource=None,
        execution_context=ExecutionContext(GraphSchema({}), "123"),
    )

    _, resource = node()
    assert resource == Resource(node_name)

    with default_model_storage.read_from(resource) as directory:
        assert (
            rasa.shared.utils.io.read_json_file(directory / "test.json") == test_value
        )
        assert (
            rasa.shared.utils.io.read_json_file(directory / "sub_dir" / "test.json")
            == test_value_for_sub_directory
        )
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> CountVectorsFeaturizer:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_dir:
            featurizer_file = model_dir / "vocabularies.pkl"
            vocabulary = io_utils.json_unpickle(featurizer_file)

            share_vocabulary = config["use_shared_vocab"]

            if share_vocabulary:
                vectorizers = cls._create_shared_vocab_vectorizers(
                    config, vocabulary=vocabulary
                )
            else:
                vectorizers = cls._create_independent_vocab_vectorizers(
                    config, vocabulary=vocabulary
                )

            oov_words = rasa.shared.utils.io.read_json_file(
                model_dir / "oov_words.json"
            )

            ftr = cls(
                config,
                model_storage,
                resource,
                execution_context,
                vectorizers=vectorizers,
                oov_token=config["OOV_token"],
                oov_words=oov_words,
            )

            # make sure the vocabulary has been loaded correctly
            for attribute in vectorizers:
                ftr.vectorizers[attribute]._validate_vocabulary()

            return ftr
    except (ValueError, FileNotFoundError, FileIOException):
        logger.debug(
            f"Failed to load `{cls.__name__}` from model storage. "
            f"Resource '{resource.name}' doesn't exist."
        )
        return cls(
            config=config,
            model_storage=model_storage,
            resource=resource,
            execution_context=execution_context,
        )
def test_read_from_not_existing_resource(default_model_storage: ModelStorage):
    with default_model_storage.write_to(Resource("resource1")) as temporary_directory:
        file = temporary_directory / "file.txt"
        file.write_text("test")

    with pytest.raises(ValueError):
        with default_model_storage.read_from(Resource("a different resource")) as _:
            pass
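# A hypothetical companion test (not part of the suite above) showing the
# successful write/read round trip through the same `ModelStorage` API,
# for contrast with the failure case:
def test_read_from_existing_resource(default_model_storage: ModelStorage):
    resource = Resource("resource1")
    with default_model_storage.write_to(resource) as temporary_directory:
        (temporary_directory / "file.txt").write_text("test")

    # Reading the same resource back yields a directory containing the
    # previously persisted file.
    with default_model_storage.read_from(resource) as read_directory:
        assert (read_directory / "file.txt").read_text() == "test"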
def to_cache(self, directory: Path, model_storage: ModelStorage) -> None:
    """Persists the `Resource` to the cache.

    Args:
        directory: The directory which receives the persisted `Resource`.
        model_storage: The model storage which currently contains the
            persisted `Resource`.
    """
    with model_storage.read_from(self) as resource_directory:
        rasa.utils.common.copy_directory(resource_directory, directory)
async def test_train_model_not_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet({EPOCHS: 1, CHECKPOINT_MODEL: False})

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        all_files = list(model_dir.rglob("*.*"))
        assert not any("from_checkpoint" in str(filename) for filename in all_files)
async def test_train_model_not_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet({EPOCHS: 2, CHECKPOINT_MODEL: False})

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert not checkpoint_dir.is_dir()
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> DomainProvider:
    """Creates provider using a persisted version of itself."""
    with model_storage.read_from(resource) as resource_directory:
        domain = Domain.from_path(resource_directory)
    return cls(model_storage, resource, domain)
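# For context, a minimal sketch of the write side this `load` relies on. The
# method name `_persist` and the file name "domain.yml" are assumptions;
# `Domain.from_path` above accepts a directory, so it will pick up whatever
# domain file was persisted into the resource directory.
def _persist(self, domain: Domain) -> None:
    with self._model_storage.write_to(self._resource) as resource_directory:
        # `Domain.persist` serializes the domain to the given file path.
        domain.persist(resource_directory / "domain.yml")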
def test_caching_empty_resource(
    default_model_storage: ModelStorage,
    tmp_path: Path,
    tmp_path_factory: TempPathFactory,
):
    resource_name = "my resource"
    resource = Resource(resource_name)

    # does not raise
    resource.to_cache(tmp_path, default_model_storage)

    with pytest.raises(ValueError):
        with default_model_storage.read_from(resource) as _:
            pass

    cache_dir = tmp_path_factory.mktemp("cache_dir")

    # this doesn't create an empty directory in `default_model_storage`
    Resource.from_cache(
        resource_name, cache_dir, default_model_storage, resource.output_fingerprint
    )

    with pytest.raises(ValueError):
        with default_model_storage.read_from(resource) as _:
            pass
def to_cache(self, directory: Path, model_storage: ModelStorage) -> None:
    """Persists the `Resource` to the cache.

    Args:
        directory: The directory which receives the persisted `Resource`.
        model_storage: The model storage which currently contains the
            persisted `Resource`.
    """
    try:
        with model_storage.read_from(self) as resource_directory:
            rasa.utils.common.copy_directory(resource_directory, directory)
    except ValueError:
        logger.debug(
            f"Skipped caching resource '{self.name}' as no persisted "
            "data was found."
        )
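# A minimal sketch of the inverse direction, matching the signature of the
# `Resource.from_cache(...)` call in the caching test above. The empty-
# directory guard is an assumption inferred from that test's expectation that
# restoring an empty cache entry doesn't create anything in model storage.
@classmethod
def from_cache(
    cls,
    node_name: Text,
    directory: Path,
    model_storage: ModelStorage,
    output_fingerprint: Text,
) -> Resource:
    """Restores a cached `Resource` into the given model storage."""
    resource = cls(node_name, output_fingerprint=output_fingerprint)

    if not any(directory.glob("*")):
        # Nothing was cached for this resource; skip writing to model storage.
        return resource

    with model_storage.write_to(resource) as resource_directory:
        rasa.utils.common.copy_directory(directory, resource_directory)

    return resource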
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> PersistableTestComponent:
    assert model_storage
    assert resource

    with model_storage.read_from(resource) as directory:
        eager_instantiated_value = rasa.shared.utils.io.read_json_file(
            directory / "test.json"
        )
    return cls(config, model_storage, resource, eager_instantiated_value)
def test_train_model_checkpointing(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    default_model_storage: ModelStorage,
    train_and_preprocess: Callable[..., Tuple[TrainingData, List[GraphComponent]]],
):
    pipeline = [
        {"component": WhitespaceTokenizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 3,
            "max_ngram": 17,
            "max_features": 10,
            "min_df": 5,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(
        pipeline, "data/test_selectors"
    )

    config_params = {
        EPOCHS: 5,
        MODEL_CONFIDENCE: "softmax",
        CONSTRAIN_SIMILARITIES: True,
        CHECKPOINT_MODEL: True,
        EVAL_NUM_EPOCHS: 1,
        EVAL_NUM_EXAMPLES: 10,
    }

    response_selector = create_response_selector(config_params)
    assert response_selector.component_config[CHECKPOINT_MODEL]

    resource = response_selector.train(training_data=training_data)

    with default_model_storage.read_from(resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert checkpoint_dir.is_dir()

        # There should be at least two `tf_model` files in the `checkpoints`
        # directory:
        # - tf_model.data
        # - tf_model.index
        checkpoint_files = list(checkpoint_dir.rglob("*.*"))
        assert len(checkpoint_files) >= 2
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
) -> GraphComponent:
    """Loads trained component from disk."""
    try:
        with model_storage.read_from(resource) as model_dir:
            # `load` here is presumably `joblib.load` imported at module
            # level, not a recursive call to this classmethod.
            tfidfvectorizer = load(model_dir / "tfidfvectorizer.joblib")
            component = cls(
                config, execution_context.node_name, model_storage, resource
            )
            component.tfm = tfidfvectorizer
    except (ValueError, FileNotFoundError):
        logger.debug(
            f"Couldn't load metadata for component '{cls.__name__}' as the "
            "persisted model data couldn't be loaded."
        )
        # Fall back to an untrained component so that `component` is always
        # bound before the return below.
        component = cls(config, execution_context.node_name, model_storage, resource)
    return component
def create(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
) -> RuleOnlyDataProvider:
    """Creates component (see parent class for docstring)."""
    rule_only_data = {}
    try:
        with model_storage.read_from(resource) as directory:
            rule_only_data = rasa.shared.utils.io.read_json_file(
                directory / "rule_only_data.json"
            )
    except ValueError:
        logger.debug(
            "Failed to load rule-only data from a trained 'RulePolicy'. "
            "Providing empty rule-only data instead."
        )

    return cls(rule_only_data)
def test_train_model_checkpointing(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    default_model_storage: ModelStorage,
    train_and_preprocess: Callable[..., Tuple[TrainingData, List[GraphComponent]]],
):
    pipeline = [
        {"component": WhitespaceTokenizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 3,
            "max_ngram": 17,
            "max_features": 10,
            "min_df": 5,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(
        pipeline, "data/test_selectors"
    )

    config_params = {
        EPOCHS: 2,
        MODEL_CONFIDENCE: "softmax",
        CONSTRAIN_SIMILARITIES: True,
        CHECKPOINT_MODEL: True,
        EVAL_NUM_EPOCHS: 1,
        EVAL_NUM_EXAMPLES: 10,
    }

    response_selector = create_response_selector(config_params)
    assert response_selector.component_config[CHECKPOINT_MODEL]

    resource = response_selector.train(training_data=training_data)

    with default_model_storage.read_from(resource) as model_dir:
        all_files = list(model_dir.rglob("*.*"))
        assert any("from_checkpoint" in str(filename) for filename in all_files)
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> GraphComponent:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_dir:
            classifier = joblib.load(model_dir / f"{resource.name}.joblib")
            component = cls(
                config, execution_context.node_name, model_storage, resource
            )
            component.clf = classifier
            return component
    except ValueError:
        logger.debug(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        # Use the same constructor arguments as the success path above.
        return cls(config, execution_context.node_name, model_storage, resource)
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> KeywordIntentClassifier:
    """Loads trained component (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as model_dir:
            keyword_file = model_dir / f"{cls.__name__}.json"
            intent_keyword_map = rasa.shared.utils.io.read_json_file(keyword_file)
    except ValueError:
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        intent_keyword_map = None

    return cls(config, model_storage, resource, execution_context, intent_keyword_map)
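# For context, a minimal sketch of the persist step implied by `load` above,
# writing the keyword map to the `{cls.__name__}.json` file that `load` reads.
# The attribute name `self.intent_keyword_map` is an assumption:
def persist(self) -> None:
    """Persists the intent-keyword map to model storage."""
    with self._model_storage.write_to(self._resource) as model_dir:
        rasa.shared.utils.io.dump_obj_as_json_to_file(
            model_dir / f"{self.__class__.__name__}.json", self.intent_keyword_map
        )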
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> JiebaTokenizerGraphComponent:
    """Loads a custom dictionary from model storage."""
    dictionary_path = config["dictionary_path"]

    # If a custom dictionary path is in the config, we know that it should
    # have been saved to the model storage.
    if dictionary_path is not None:
        try:
            with model_storage.read_from(resource) as resource_directory:
                cls._load_custom_dictionary(str(resource_directory))
        except ValueError:
            logger.warning(
                f"Failed to load {cls.__name__} from model storage. "
                f"Resource '{resource.name}' doesn't exist."
            )
    return cls(config, model_storage, resource)
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> CRFEntityExtractor:
    """Loads trained component (see parent class for full docstring)."""
    import joblib

    try:
        entity_taggers = OrderedDict()

        with model_storage.read_from(resource) as model_dir:
            # We have to load in the same order as we persisted things, as
            # otherwise the predictions might be off.
            file_names = sorted(model_dir.glob("**/*.pkl"))
            if not file_names:
                logger.debug(
                    "Failed to load model for 'CRFEntityExtractor'. "
                    "Maybe you did not provide enough training data and "
                    "no model was trained."
                )
                return cls(config, model_storage, resource)

            for file_name in file_names:
                name = file_name.stem[1:]
                entity_taggers[name] = joblib.load(file_name)

            return cls(config, model_storage, resource, entity_taggers)
    except ValueError:
        logger.warning(
            f"Failed to load {cls.__name__} from model storage. Resource "
            f"'{resource.name}' doesn't exist."
        )
        return cls(config, model_storage, resource)
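# For context, a hedged sketch of the persist side implied by `load` above:
# each tagger is written as `{index}{name}.pkl`, so lexicographic sorting in
# `load` restores insertion order and `file_name.stem[1:]` strips the index.
# The exact file-name scheme and `self.entity_taggers` attribute are
# assumptions inferred from the loading code:
def persist(self) -> None:
    """Persists the entity taggers in insertion order."""
    import joblib

    with self._model_storage.write_to(self._resource) as model_dir:
        for index, (name, entity_tagger) in enumerate(self.entity_taggers.items()):
            joblib.dump(entity_tagger, model_dir / f"{index}{name}.pkl")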
async def test_train_model_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet(
        {EPOCHS: 2, EVAL_NUM_EPOCHS: 1, EVAL_NUM_EXAMPLES: 10, CHECKPOINT_MODEL: True},
    )

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert checkpoint_dir.is_dir()

        # It's tricky to validate the *exact* number of files that should be
        # there; however, there must be at least the following:
        # - metadata.json
        # - checkpoint
        # - component_1_CountVectorsFeaturizer (as per the pipeline above)
        # - component_2_DIETClassifier files (more than 1 file)
        all_files = list(model_dir.rglob("*.*"))
        assert len(all_files) > 4
def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> GraphComponent:
    """Loads a `FineTuningValidator` (see parent class for full docstring)."""
    try:
        with model_storage.read_from(resource) as path:
            fingerprints = rasa.shared.utils.io.read_json_file(
                filename=path / cls.FILENAME,
            )
            return cls(
                model_storage=model_storage,
                execution_context=execution_context,
                resource=resource,
                fingerprints=fingerprints,
            )
    except ValueError as e:
        raise InvalidConfigException(
            f"Loading {cls.__name__} failed. Ensure that the {cls.__name__} "
            f"is part of your training graph and re-train your models before "
            f"attempting to use the {cls.__name__}."
        ) from e
def test_jieba_load_and_persist_dictionary(
    tmp_path_factory: TempPathFactory,
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    caplog: LogCaptureFixture,
):
    dictionary_directory = tmp_path_factory.mktemp("dictionaries")
    dictionary_path = dictionary_directory / "dictionary_1"

    dictionary_contents = """
创新办 3 i
云计算 5
凱特琳 nz
台中
"""
    dictionary_path.write_text(dictionary_contents, encoding="utf-8")

    component_config = {"dictionary_path": dictionary_directory}

    resource = Resource("jieba")
    tk = JiebaTokenizerGraphComponent.create(
        {**JiebaTokenizerGraphComponent.get_default_config(), **component_config},
        default_model_storage,
        resource,
        default_execution_context,
    )

    tk.process_training_data(TrainingData([Message(data={TEXT: ""})]))

    # The dictionary has not been persisted yet.
    with caplog.at_level(logging.WARN):
        JiebaTokenizerGraphComponent.load(
            {**JiebaTokenizerGraphComponent.get_default_config(), **component_config},
            default_model_storage,
            resource,
            default_execution_context,
        )
        assert any(
            "Failed to load JiebaTokenizerGraphComponent from model storage."
            in message
            for message in caplog.messages
        )

    tk.persist()

    # Check the persisted dictionary matches the original file.
    with default_model_storage.read_from(resource) as resource_dir:
        contents = (resource_dir / "dictionary_1").read_text(encoding="utf-8")
        assert contents == dictionary_contents

    # Delete original files to show that we read from the model storage.
    dictionary_path.unlink()
    dictionary_directory.rmdir()

    JiebaTokenizerGraphComponent.load(
        {**JiebaTokenizerGraphComponent.get_default_config(), **component_config},
        default_model_storage,
        resource,
        default_execution_context,
    )

    tk.process([Message(data={TEXT: ""})])
def test_nlu_training_data_provider(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    config_path: Text,
    nlu_data_path: Text,
):
    # create a resource and an importer
    resource = Resource("xy")
    importer = TrainingDataImporter.load_from_config(
        config_path=config_path, training_data_paths=[nlu_data_path]
    )

    # check the default configuration is as expected
    config_1 = NLUTrainingDataProvider.get_default_config()
    assert config_1["language"] is None
    assert config_1["persist"] is False

    # create a provider with persist == True
    provider_1 = NLUTrainingDataProvider.create(
        {"language": "en", "persist": True},
        default_model_storage,
        resource,
        default_execution_context,
    )
    assert isinstance(provider_1, NLUTrainingDataProvider)

    # check the data provided is as expected
    data_0 = provider_1.provide(importer)
    data_1 = importer.get_nlu_data(language="en")
    assert data_0.fingerprint() == data_1.fingerprint()

    # check the data was persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(
            str(resource_directory), DEFAULT_TRAINING_DATA_OUTPUT_PATH
        )
        data = load_data(resource_name=data_file, language="en")
        assert os.path.isfile(data_file)
        assert isinstance(data, TrainingData)

        # delete the persisted data
        os.remove(data_file)
        assert not os.path.isfile(data_file)

    # create a provider with persist == False
    provider_2 = NLUTrainingDataProvider.create(
        {"language": "en", "persist": False},
        default_model_storage,
        resource,
        default_execution_context,
    )
    provider_2.provide(importer)

    # check the data was not persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(
            str(resource_directory), DEFAULT_TRAINING_DATA_OUTPUT_PATH
        )
        assert not os.path.isfile(data_file)