def test_char_lstm(self, resource_loader):
    """Tests that a fit succeeds for a char-tokenized LSTM text classifier,
    with and without terminal tokens added."""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "lstm"},
        "params": {"emb_dim": 30, "tokenizer_type": "char-tokenizer"},
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.intents()
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    assert model.predict([markup.load_query("hi").query])[0] in ["greet", "exit"]

    # Fix: pass a real boolean for ``add_terminals`` (sibling tests use
    # ``True``, not the string "True"); the string is only incidentally
    # truthy and would mask a broken "False" case.
    config = {**config, "params": {**config["params"], "add_terminals": True}}
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    assert model.predict([markup.load_query("hi").query])[0] in ["greet", "exit"]
def test_default_embedder(self, resource_loader):
    """Tests that a fit succeeds w/ and w/o crf layer"""
    config = {
        "model_type": "tagger",
        "example_type": ENTITY_EXAMPLE_TYPE,
        "label_type": ENTITIES_LABEL_TYPE,
        "model_settings": {"classifier_type": "embedder"},
        "params": {"emb_dim": 5},
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    examples = self.labeled_data.queries()
    labels = self.labeled_data.entities()
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # Refit the same embedder tagger with the CRF output layer disabled.
    config = {**config, "params": {**config["params"], "use_crf_layer": False}}
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)
def test_char_embedder(self, resource_loader):
    """Tests that a fit succeeds w/ and w/o crf layer"""
    config = {
        "model_type": "tagger",
        "example_type": ENTITY_EXAMPLE_TYPE,
        "label_type": ENTITIES_LABEL_TYPE,
        "model_settings": {"classifier_type": "embedder"},
        "params": {
            # default token_spans_pooling_type is "first"
            "emb_dim": 30,
            "tokenizer_type": "char-tokenizer",
        },
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.entities()
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # Fix: pass a real boolean for ``add_terminals`` (matching sibling tests)
    # instead of the string "True", which is only incidentally truthy.
    config = {**config, "params": {**config["params"], "add_terminals": True}}
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    config = {
        **config,
        "params": {**config["params"], "token_spans_pooling_type": "mean"},
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    config = {**config, "params": {**config["params"], "use_crf_layer": False}}
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)
def test_fit(self, resource_loader):
    """Tests that a basic fit succeeds"""
    model_config = ModelConfig(**{
        'model_type': 'text',
        'example_type': QUERY_EXAMPLE_TYPE,
        'label_type': CLASS_LABEL_TYPE,
        'model_settings': {'classifier_type': 'logreg'},
        'params': {'fit_intercept': True, 'C': 100},
        'features': {
            'bag-of-words': {'lengths': [1]},
            'freq': {'bins': 5},
            'length': {},
        },
    })
    model = TextModel(model_config)
    queries = [q.query for q in self.labeled_data]
    intents = [q.intent for q in self.labeled_data]
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)
    # The explicitly supplied params should survive the fit untouched.
    assert model._current_params == {'fit_intercept': True, 'C': 100}
def test_extract_features(self, resource_loader):
    """Tests extracted features after a fit"""
    model_config = ModelConfig(**{
        'model_type': 'text',
        'example_type': QUERY_EXAMPLE_TYPE,
        'label_type': CLASS_LABEL_TYPE,
        'model_settings': {'classifier_type': 'logreg'},
        'params': {'fit_intercept': True, 'C': 100},
        'features': {
            'bag-of-words': {'lengths': [1]},
        },
    })
    model = TextModel(model_config)
    queries = [q.query for q in self.labeled_data]
    intents = [q.intent for q in self.labeled_data]
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)

    # Unigram bag-of-words features for a two-token query.
    expected_features = {
        'bag_of_words|length:1|ngram:hi': 1,
        'bag_of_words|length:1|ngram:there': 1,
    }
    extracted_features = model.view_extracted_features(
        markup.load_query('hi there').query)
    assert extracted_features == expected_features
def test_fit_cv(self, resource_loader):
    """Tests fitting with param selection"""
    model_config = ModelConfig(**{
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "logreg"},
        "param_selection": {
            "type": "k-fold",
            "k": 10,
            "grid": {"C": [10, 100, 1000], "fit_intercept": [True, False]},
        },
        "features": {
            "bag-of-words": {"lengths": [1]},
            "freq": {"bins": 5},
            "length": {},
        },
    })
    model = TextModel(model_config)
    queries = self.labeled_data.queries()
    intents = self.labeled_data.intents()
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)
    # k-fold selection must have settled on some concrete param set.
    assert model._current_params
def test_fit_cv(self, resource_loader):
    """Tests fitting with param selection"""
    model_config = ModelConfig(**{
        'model_type': 'text',
        'example_type': QUERY_EXAMPLE_TYPE,
        'label_type': CLASS_LABEL_TYPE,
        'model_settings': {'classifier_type': 'logreg'},
        'param_selection': {
            'type': 'k-fold',
            'k': 10,
            'grid': {'C': [10, 100, 1000], 'fit_intercept': [True, False]},
        },
        'features': {
            'bag-of-words': {'lengths': [1]},
            'freq': {'bins': 5},
            'length': {},
        },
    })
    model = TextModel(model_config)
    queries = [q.query for q in self.labeled_data]
    intents = [q.intent for q in self.labeled_data]
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)
    # Grid search must have selected some concrete parameter set.
    assert model._current_params
def test_fit(self, resource_loader):
    """Tests that a basic fit succeeds"""
    model_config = ModelConfig(**{
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "logreg"},
        "params": {"fit_intercept": True, "C": 100},
        "features": {
            "bag-of-words": {"lengths": [1]},
            "freq": {"bins": 5},
            "length": {},
        },
    })
    model = TextModel(model_config)
    queries = self.labeled_data.queries()
    intents = self.labeled_data.intents()
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)
    # The explicitly supplied params should survive the fit untouched.
    assert model._current_params == {"fit_intercept": True, "C": 100}
def test_extract_features(self, resource_loader):
    """Tests extracted features after a fit"""
    model_config = ModelConfig(**{
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "logreg"},
        "params": {"fit_intercept": True, "C": 100},
        "features": {
            "bag-of-words": {"lengths": [1]},
        },
    })
    model = TextModel(model_config)
    queries = self.labeled_data.queries()
    intents = self.labeled_data.intents()
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)

    # "there" is out-of-vocabulary for this training data, hence the OOV
    # feature rather than an ngram:there feature.
    expected_features = {
        "bag_of_words|length:1|ngram:hi": 1,
        "bag_of_words|length:1|ngram:OOV": 1,
    }
    extracted_features = model.view_extracted_features(
        markup.load_query("hi there").query
    )
    assert extracted_features == expected_features
def test_fit_predict(self, resource_loader):
    """Tests prediction after a fit"""
    model_config = ModelConfig(**{
        'model_type': 'text',
        'example_type': QUERY_EXAMPLE_TYPE,
        'label_type': CLASS_LABEL_TYPE,
        'model_settings': {'classifier_type': 'logreg'},
        'params': {'fit_intercept': True, 'C': 100},
        'features': {
            'bag-of-words': {'lengths': [1]},
            'freq': {'bins': 5},
            'length': {},
        },
    })
    model = TextModel(model_config)
    queries = [q.query for q in self.labeled_data]
    intents = [q.intent for q in self.labeled_data]
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)

    # Fitted classifier should map greetings and farewells to their intents.
    assert model.predict([markup.load_query('hi').query]) == 'greet'
    assert model.predict([markup.load_query('bye').query]) == 'exit'
def test_fit_predict(self, resource_loader):
    """Tests prediction after a fit"""
    model_config = ModelConfig(**{
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "logreg"},
        "params": {"fit_intercept": True, "C": 100},
        "features": {
            "bag-of-words": {"lengths": [1]},
            "freq": {"bins": 5},
            "length": {},
        },
    })
    model = TextModel(model_config)
    queries = self.labeled_data.queries()
    intents = self.labeled_data.intents()
    model.initialize_resources(resource_loader, queries, intents)
    model.fit(queries, intents)

    # Fitted classifier should map greetings and farewells to their intents.
    assert model.predict([markup.load_query("hi").query]) == "greet"
    assert model.predict([markup.load_query("bye").query]) == "exit"
def test_default_embedder(self, resource_loader):
    """Tests that a fit succeeds"""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "embedder"},
        "params": {"emb_dim": 5},
        # default embedder_output_pooling_type is "mean"
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.intents()

    def fit_and_assert(cfg):
        # Build, fit, and sanity-check one configuration.
        model = ModelFactory.create_model_from_config(ModelConfig(**cfg))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)
        assert model.predict([markup.load_query("hi").query])[0] in ["greet", "exit"]

    # Default pooling, then the two explicit alternatives.
    fit_and_assert(config)
    for pooling in ("first", "last"):
        config = {
            **config,
            "params": {**config["params"], "embedder_output_pooling_type": pooling},
        }
        fit_and_assert(config)
def test_create_model_from_helpers_without_input_type():
    """A config lacking example/label types must fail model creation."""
    config = {
        "model_type": "tagger",
        "model_settings": {"classifier_type": "lstm-pytorch"},
        "params": {},
    }
    # ModelConfig requires example_type/label_type, so this raises TypeError.
    with pytest.raises(TypeError):
        model = create_model(config=ModelConfig(**config))
        del model  # silence unused-variable warnings; never reached
def test_create_model_from_helpers():
    """A fully specified tagger config yields a pytorch tagger model."""
    config = {
        "model_type": "tagger",
        "example_type": ENTITY_EXAMPLE_TYPE,
        "label_type": ENTITIES_LABEL_TYPE,
        "model_settings": {"classifier_type": "lstm-pytorch"},
        "params": {},
    }
    created = create_model(config=ModelConfig(**config))
    assert isinstance(created, PytorchTaggerModel)
def test_create_model_from_config_object():
    """The factory builds a pytorch text model from a ModelConfig object."""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "lstm"},
    }
    created = ModelFactory.create_model_from_config(
        model_config=ModelConfig(**config))
    assert isinstance(created, PytorchTextModel)
def test_bert_embedder_frozen_params(self, resource_loader):
    """Tests fitting with frozen embedder weights, and that the resulting
    partial state dict can be dumped and reloaded for prediction."""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "embedder"},
        "params": {
            # default embedder_output_pooling_type for bert is "first"
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "distilbert-base-uncased",
            "embedder_output_pooling_type": "mean",
            "update_embeddings": False,
        },
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.intents()

    # fit the model
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)

    # assert only some weights are trainable
    clf = model._clf
    n_requires_grad, n_total = get_num_weights_of_model(clf)
    assert n_requires_grad < n_total, print(n_requires_grad, n_total)

    # check if dumping and loading partial state dict logs required messages & throws no errors
    os.makedirs(GENERATED_TMP_FOLDER, exist_ok=True)
    clf.dump(GENERATED_TMP_FOLDER)
    new_clf = clf.load(GENERATED_TMP_FOLDER)
    shutil.rmtree(GENERATED_TMP_FOLDER)

    # do predictions with loaded model
    model._clf = new_clf
    assert model.predict([markup.load_query("hi").query])[0] in ["greet", "exit"]
def test_glove_cnn(self, resource_loader):
    """Tests that a fit succeeds"""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "cnn"},
        "params": {"embedder_type": "glove"},
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.intents()
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    # A fitted glove+CNN classifier must predict one of the known intents.
    assert model.predict([markup.load_query("hi").query])[0] in ["greet", "exit"]
def test_bert_lstm(self, resource_loader):
    """Tests that a fit succeeds"""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "lstm"},
        "params": {
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "bert-base-cased",
        },
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.intents()

    # To use a embedder_type 'bert', classifier_type must be 'embedder'.
    with pytest.raises(ValueError):
        model = ModelFactory.create_model_from_config(ModelConfig(**config))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)
def test_bert_embedder(self, resource_loader):
    """Tests that a fit succeeds"""
    config = {
        "model_type": "text",
        "example_type": QUERY_EXAMPLE_TYPE,
        "label_type": CLASS_LABEL_TYPE,
        "model_settings": {"classifier_type": "embedder"},
        "params": {
            # default embedder_output_pooling_type for bert is "first"
            "embedder_type": "bert"
        },
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.intents()

    def fit_and_assert(cfg):
        # Build, fit, and sanity-check one configuration.
        model = ModelFactory.create_model_from_config(ModelConfig(**cfg))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)
        assert model.predict([markup.load_query("hi").query])[0] in ["greet", "exit"]

    # test different configurations for bert-base-cased model
    config = {
        **config,
        "params": {
            **config["params"],
            "pretrained_model_name_or_path": "bert-base-cased",
        },
    }
    fit_and_assert(config)

    for pooling in ("mean", "last", "max", "mean_sqrt"):
        config = {
            **config,
            "params": {**config["params"], "embedder_output_pooling_type": pooling},
        }
        fit_and_assert(config)

    # test for different pretrained transformers
    # NOTE(review): these configs replace the whole "params" dict, dropping
    # "embedder_type": "bert" that the tagger twin of this test keeps —
    # confirm that is intentional.
    for checkpoint in (
        "distilbert-base-uncased",
        "roberta-base",
        "albert-base-v2",
        "sentence-transformers/all-mpnet-base-v2",
    ):
        config = {
            **config,
            "params": {"pretrained_model_name_or_path": checkpoint},
        }
        fit_and_assert(config)

    config = {
        **config,
        "params": {**config["params"], "embedder_output_pooling_type": "mean"},
    }
    fit_and_assert(config)
def test_char_cnn_word_lstm(self, resource_loader):
    """Tests that a fit succeeds w/ and w/o crf layer"""
    config = {
        "model_type": "tagger",
        "example_type": ENTITY_EXAMPLE_TYPE,
        "label_type": ENTITIES_LABEL_TYPE,
        "model_settings": {"classifier_type": "cnn-lstm"},
        "params": {"emb_dim": 5},
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.entities()

    # Each of these param combinations is unsupported for cnn-lstm and
    # must be rejected with a ValueError.
    incorrect_config = {
        **config,
        "params": {**config["params"], "add_terminals": True},
    }
    with pytest.raises(ValueError):
        model = ModelFactory.create_model_from_config(
            ModelConfig(**incorrect_config))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)

    incorrect_config = {
        **config,
        "params": {**config["params"], "tokenizer_type": "char-tokenizer"},
    }
    with pytest.raises(ValueError):
        model = ModelFactory.create_model_from_config(
            ModelConfig(**incorrect_config))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)

    incorrect_config = {
        **config,
        "params": {
            **config["params"],
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "bert-base-cased",
        },
    }
    with pytest.raises(ValueError):
        model = ModelFactory.create_model_from_config(
            ModelConfig(**incorrect_config))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)

    # Valid base config fits and predicts.
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # Same model minus the CRF output layer.
    config = {**config, "params": {**config["params"], "use_crf_layer": False}}
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # Glove-initialized word embeddings.
    glove_config = {**config, "params": {"embedder_type": "glove"}}
    model = ModelFactory.create_model_from_config(ModelConfig(**glove_config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)
def test_bert_embedder(self, resource_loader):
    """Tests that a fit succeeds w/ and w/o crf layer"""
    config = {
        "model_type": "tagger",
        "example_type": ENTITY_EXAMPLE_TYPE,
        "label_type": ENTITIES_LABEL_TYPE,
        "model_settings": {"classifier_type": "embedder"},
        "params": {"embedder_type": "bert"},
    }
    examples = self.labeled_data.queries()
    labels = self.labeled_data.entities()

    config = {
        **config,
        "params": {
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "bert-base-cased",
            "add_terminals": True,
        },
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # Same checkpoint with mean pooling over token spans.
    new_config = {
        **config,
        "params": {**config["params"], "token_spans_pooling_type": "mean"},
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**new_config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # test for different pretrained transformers
    config = {
        **config,
        "params": {
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "distilbert-base-uncased",
        },
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    config = {
        **config,
        "params": {
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "albert-base-v2",
        },
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    config = {
        **config,
        "params": {
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "sentence-transformers/all-mpnet-base-v2",
        },
    }
    model = ModelFactory.create_model_from_config(ModelConfig(**config))
    model.initialize_resources(resource_loader, examples, labels)
    model.fit(examples, labels)
    model_predictions_assertions(model)

    # roberta-base is expected to be rejected for this tagger setup.
    config = {
        **config,
        "params": {
            "embedder_type": "bert",
            "pretrained_model_name_or_path": "roberta-base",
        },
    }
    with pytest.raises(ValueError):
        model = ModelFactory.create_model_from_config(ModelConfig(**config))
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)
        model_predictions_assertions(model)