from pathlib import Path
from typing import Any, Dict, Optional, Text

import dataclasses
import numpy as np
import pytest

# NOTE: import paths below are assumptions based on the Rasa 2.x/3.x module
# layout; the tests in this file mix APIs from several Rasa versions.
from rasa.engine.storage.resource import Resource
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
    CountVectorsFeaturizer,
)
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData


# Illustrative parametrization (assumption): the original decorator was not
# preserved in this file; the texts are placeholders, only the boolean drives
# the assertion below.
@pytest.mark.parametrize(
    "initial_train_text, additional_train_text, use_shared_vocab",
    [("am I the coolest person?", "no", True), ("rasa rasa", "sara sara", False)],
)
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    tmp_path: Path,
):
    """Tests that an exception is raised when `use_shared_vocab` is set to
    True during incremental training."""
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer(
        component_config={"use_shared_vocab": use_shared_vocab}
    )
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    tk.train(data)
    initial_cvf.train(data)

    # persist and reload in finetune mode
    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)


def test_cvf_incremental_train_vocabulary_overflow(tmp_path: Path):
    additional_size = 3
    original_train_text = "hello my name is John."
    additional_train_text = "I am also new."
    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": additional_size}},
        finetune_mode=False,
    )
    train_message = Message(data={"text": original_train_text})
    data = TrainingData([train_message])
    tokenizer.train(data)
    original_featurizer.train(data)
    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(
        meta, str(tmp_path), should_finetune=True
    )

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)
    with pytest.warns(UserWarning) as warning:
        new_featurizer.train(data)
    assert "New data contains vocabulary of size" in warning[0].message.args[0]


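# Illustrative parametrization (assumption): the original decorator for
# `test_cvf_incremental_train_vocabulary` was not preserved. The expected
# sizes below assume 5 learned tokens plus 3 buffer slots and 2 new tokens
# after finetuning; adjust them to the buffer accounting of your Rasa version.
@pytest.mark.parametrize(
    "additional_size, original_train_text, additional_train_text, "
    "total_vocabulary_size, remaining_buffer_size",
    [(3, "hello my name is John.", "you are", 8, 1)],
)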
def test_cvf_incremental_train_vocabulary(
    additional_size: Optional[int],
    original_train_text: Text,
    additional_train_text: Text,
    total_vocabulary_size: int,
    remaining_buffer_size: int,
    tmp_path: Path,
):
    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": additional_size}},
        finetune_mode=False,
    )
    train_message = Message(data={"text": original_train_text})
    data = TrainingData([train_message])
    tokenizer.train(data)
    original_featurizer.train(data)

    # Check total vocabulary size with buffer slots before finetuning
    original_vocabulary = original_featurizer.vectorizers["text"].vocabulary_
    assert len(original_vocabulary) == total_vocabulary_size

    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(
        meta, str(tmp_path), should_finetune=True
    )

    # Check total vocabulary size with buffer slots after loading
    assert len(new_featurizer.vectorizers["text"].vocabulary_) == total_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)
    new_featurizer.train(data)
    new_vocabulary = new_featurizer.vectorizers["text"].vocabulary_

    # Check total vocabulary size with buffer slots after finetuning
    assert len(new_vocabulary) == total_vocabulary_size

    # Check remaining buffer slots after finetuning
    assert (
        len(new_vocabulary) - new_featurizer._get_starting_empty_index(new_vocabulary)
        == remaining_buffer_size
    )

    # Check indices of original vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in original_vocabulary.items():
        if not vocab_token.startswith("buf_"):
            assert vocab_token in new_vocabulary
            assert new_vocabulary.get(vocab_token) == vocab_index


# NOTE: `inner` closes over `default_model_storage` and `default_execution_context`,
# so it is presumably the factory returned by an enclosing pytest fixture. The
# wrapper below is a sketch, assuming those two fixtures exist as in Rasa 3.x
# test setups; the fixture name `create_featurizer` is an assumption.
@pytest.fixture()
def create_featurizer(default_model_storage, default_execution_context):
    def inner(
        config: Optional[Dict[Text, Any]] = None, is_finetuning: bool = False
    ) -> CountVectorsFeaturizer:
        config = config or {}
        return CountVectorsFeaturizer.load(
            {**CountVectorsFeaturizer.get_default_config(), **config},
            default_model_storage,
            Resource("count_vectors_featurizer"),
            dataclasses.replace(
                default_execution_context, is_finetuning=is_finetuning
            ),
        )

    return inner


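# A minimal usage sketch for the factory above (assumption: when nothing has
# been persisted for the resource yet, `CountVectorsFeaturizer.load` falls
# back to creating a fresh component, as Rasa 3.x graph components do).
def test_create_featurizer_returns_component(create_featurizer):
    featurizer = create_featurizer({"lowercase": False})
    assert isinstance(featurizer, CountVectorsFeaturizer)


# Illustrative parametrization (assumption): the original decorator for
# `test_cvf_incremental_training` was not preserved. The expected sizes are
# naive whitespace-token counts and must be adapted to the vocabulary
# accounting of your Rasa version (buffer slots may inflate them).
@pytest.mark.parametrize(
    "initial_train_text, additional_train_text, "
    "initial_vocabulary_size, final_vocabulary_size",
    [("hello my name is John.", "you are new here", 5, 9)],
)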
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    tk.train(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    new_cvf.train(data)
    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index


def test_count_vector_featurizer_persist_load(tmp_path: Path):
    # set non-default values in the config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(data={"text": sentence1})
    train_message2 = Message(data={"text": sentence2})
    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", str(tmp_path))
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute in train_vect_params:
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path))
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check if vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={"text": sentence1})
    test_ftr.process(test_message1)
    test_message2 = Message(data={"text": sentence2})
    test_ftr.process(test_message2)

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())