Example #1
    def test_preprocess_and_load_param_file(self):
        # Test that it does nothing to an empty params object
        with tempfile.TemporaryDirectory() as temp_dir:
            empty_param_fp = Path(temp_dir, 'empty')
            empty_params = Params({})
            assert len(empty_params) == 0
            empty_params.to_file(str(empty_param_fp))
            empty_params = AllenNLPModel._preprocess_and_load_param_file(
                empty_param_fp)
            assert len(empty_params) == 0
            assert isinstance(empty_params, Params)

            full_params_fp = Path(temp_dir, 'full')
            full_params = Params({
                'train_data_path': 1,
                'validation_data_path': 1,
                'test_data_path': 1,
                'evaluate_on_test': 1,
                'anything': 2
            })
            assert len(full_params) == 5
            full_params.to_file(str(full_params_fp))
            full_params = AllenNLPModel._preprocess_and_load_param_file(
                full_params_fp)
            assert len(full_params) == 1
            assert full_params['anything'] == 2
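The implementation of `_preprocess_and_load_param_file` is not shown on this page. A minimal sketch of the behaviour the assertions above imply (load the `Params` from file, then drop the dataset-path and evaluation keys) might look like the following; the body is an assumption, not the library's actual code:

from allennlp.common import Params

def _preprocess_and_load_param_file(param_fp):
    # Sketch only: loads the params and removes the keys the test above
    # expects to be gone, leaving any other keys (e.g. 'anything') intact.
    params = Params.from_file(str(param_fp))
    for key in ['train_data_path', 'validation_data_path',
                'test_data_path', 'evaluate_on_test']:
        params.pop(key, None)  # remove the key if it is present
    return params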
Example #2
 def test_predict_sequences(self, batch_size: Optional[int]):
     data = [{
         "text": "The laptop case was great and cover was rubbish"
     }, {
         "text": "Another day at the office"
     }, {
         "text": "The laptop case was great and cover was rubbish"
     }]
     answers = [{
         "sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
         "text":
         "The laptop case was great and cover was rubbish",
         "tokens":
         "The laptop case was great and cover was rubbish".split()
     }, {
         "sequence_labels": ['O', 'B', 'B', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4],
         "text": "Another day at the office",
         "tokens": "Another day at the office".split()
     }, {
         "sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
         "text":
         "The laptop case was great and cover was rubbish",
         "tokens":
         "The laptop case was great and cover was rubbish".split()
     }]
     # Requires the softmax rather than the CRF version as we want the
     # returned confidence scores to be greater than 1 / number of labels,
     # whereas the CRF maximises the entire sentence-level prediction, so
     # the confidence returned can be less than 1 / number of labels
     model_dir = self.TARGET_EXTRACTION_SF_MODEL
     model = AllenNLPModel('TE', self.SOFTMAX_CONFIG_FILE, 'target-tagger',
                           model_dir)
     model.load()
     predictions = []
     for index, prediction in enumerate(
             model.predict_sequences(data, batch_size)):
         predictions.append(prediction)
         answer = answers[index]
         assert 4 == len(prediction)
         for key, value in answer.items():
             assert len(value) == len(prediction[key])
             if key != 'confidence':
                 assert value == prediction[key]
             else:
                 for confidence_score in prediction[key]:
                     assert 0.333333 < confidence_score
                     assert 1 > confidence_score
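The confidence bound asserted above follows from a simple fact: the returned score is the softmax probability of the predicted (argmax) label, and the maximum of a probability distribution over K labels is always at least 1/K (here 1/3). A tiny self-contained check in plain numpy, independent of the model:

import numpy as np

# 9 tokens, 3 labels (e.g. B, I, O); the argmax probability per token can
# never fall below 1/3 because the three probabilities sum to 1.
logits = np.random.randn(9, 3)
probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
assert (probs.max(axis=-1) >= 1 / 3).all()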
Example #3
    def test_predict_iter(self):
        data = [{
            "text": "The laptop case was great and cover was rubbish"
        }, {
            "text": "Another day at the office"
        }, {
            "text": "The laptop case was great and cover was rubbish"
        }]
        # Test that it raises an AssertionError when the model attribute is
        # None
        model_dir = self.TARGET_EXTRACTION_MODEL
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              model_dir)
        with pytest.raises(AssertionError):
            for _ in model._predict_iter(data):
                pass
        # Test that it raises a TypeError when the data provided is not a
        # list or an iterable
        model.load()
        non_iter_data = 5
        with pytest.raises(TypeError):
            for _ in model._predict_iter(non_iter_data):
                pass
        # Test that it works on the normal cases which are lists and iterables
        for data_type in [data, iter(data)]:
            predictions = []
            for prediction in model._predict_iter(data_type):
                predictions.append(prediction)
            assert 3 == len(predictions)
            assert isinstance(predictions[0], dict)
            assert 5 == len(predictions[1]['tags'])
            assert 9 == len(predictions[1]['class_probabilities'])

        # Test that it works on a larger dataset of 150
        larger_dataset = data * 50
        for data_type in [larger_dataset, iter(larger_dataset)]:
            predictions = []
            for prediction in model._predict_iter(data_type):
                predictions.append(prediction)
            assert 150 == len(predictions)
            assert isinstance(predictions[0], dict)
            assert 5 == len(predictions[-2]['tags'])
            assert 9 == len(predictions[-2]['class_probabilities'])
            assert 9 == len(predictions[-1]['tags'])
            assert 9 == len(predictions[-1]['class_probabilities'])

        # Test the case where no data is fed, which can happen when an
        # iterator has already been exhausted, e.g.
        alt_data = iter(data)
        # exhaust the iterator so that it yields no further data
        assert 3 == len(list(alt_data))
        predictions = []
        for prediction in model._predict_iter(alt_data):
            predictions.append(prediction)
        assert not predictions
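The exhausted-iterator case tested above comes down to iterators being single-use, which is easy to forget when the same object is passed around:

# Iterators yield their items once; a second pass produces nothing.
data_iter = iter([1, 2, 3])
assert [1, 2, 3] == list(data_iter)  # the first pass consumes everything
assert [] == list(data_iter)         # the iterator is now empty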
Example #4
    def test_predict_into_collection(self, batch_size: Optional[int],
                                     append_if_exists: bool):
        # Create and load the model to predict with
        model_dir = self.TARGET_EXTRACTION_MODEL
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              model_dir)
        model.load()
        # Test the normal case
        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        key_mappings = {'tags': 'predicted_tags', 'words': 'predicted_tokens'}
        train_data = model.predict_into_collection(train_data, key_mappings,
                                                   batch_size,
                                                   append_if_exists)
        for target_data in train_data.values():
            assert 'predicted_tags' in target_data
            assert 'tags' not in target_data
            assert 'predicted_tokens' in target_data
            assert 'tokens' not in target_data

            target_tokens = target_data['tokenized_text']
            assert len(target_tokens) == len(target_data['predicted_tags'][0])
            assert len(target_tokens) == len(
                target_data['predicted_tokens'][0])
            assert target_tokens == target_data['predicted_tokens'][0]
        # This should be fine when append_if_exists is True and raise a
        # KeyError otherwise.
        if append_if_exists:
            train_data = model.predict_into_collection(train_data,
                                                       key_mappings,
                                                       batch_size,
                                                       append_if_exists)
            for target_data in train_data.values():
                target_tokens = target_data['tokenized_text']
                assert 2 == len(target_data['predicted_tags'])
                assert target_data['predicted_tags'][0] == target_data[
                    'predicted_tags'][1]
                assert target_tokens == target_data['predicted_tokens'][0]
                assert target_tokens == target_data['predicted_tokens'][1]
        else:
            with pytest.raises(KeyError):
                train_data = model.predict_into_collection(
                    train_data, key_mappings, batch_size, append_if_exists)
        # Raise a KeyError when a key in `key_mappings` does not exist in
        # the model's prediction output
        from collections import OrderedDict
        key_mappings = OrderedDict([('tags', 'predicted_tags'),
                                    ('wordss', 'predicted_tokens')])
        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        with pytest.raises(KeyError):
            train_data = model.predict_into_collection(train_data,
                                                       key_mappings,
                                                       batch_size,
                                                       append_if_exists)
        for target_data in train_data.values():
            assert 'predicted_tags' not in target_data
            assert 'predicted_tokens' not in target_data
Example #5
    def test_target_extraction_fit(self, test_data: bool):

        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
        assert model.model is None

        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        val_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)

        tokens_in_vocab = [
            'at', 'case', 'was', 'the', 'day', 'great', 'cover', 'office',
            'another', 'and', 'rubbish', 'laptop', '@@PADDING@@', '@@UNKNOWN@@'
        ]
        if test_data:
            tokens_in_vocab = tokens_in_vocab + ['better']
            test_data = TargetTextCollection.load_json(
                self.TARGET_EXTRACTION_TEST_DATA)
            model.fit(train_data, val_data, test_data)
        else:
            model.fit(train_data, val_data)

        token_index = model.model.vocab.get_token_to_index_vocabulary('tokens')
        assert len(token_index) == len(tokens_in_vocab)
        for token in tokens_in_vocab:
            assert token in token_index

        # Check attributes have changed.
        assert model.model is not None
        assert isinstance(model.model, Model)

        # Check that it will save to a directory of our choosing
        with tempfile.TemporaryDirectory() as save_dir:
            saved_model_fp = Path(save_dir, 'model.tar.gz')
            assert not saved_model_fp.exists()
            model = AllenNLPModel('TE',
                                  self.CONFIG_FILE,
                                  'target-tagger',
                                  save_dir=save_dir)
            model.fit(train_data, val_data)
            assert saved_model_fp.exists()
Example #6
    def test_add_dataset_paths(self, test_data: bool):
        # Test the case where the params are empty; the method should
        # populate them
        empty_params = Params({})

        train_fp = Path(__file__, '..', 'models', 'target_tagger_test.py')
        train_str = str(train_fp.resolve())

        val_fp = Path(__file__, '..', 'dataset_readers',
                      'target_extraction_test.py')
        val_str = str(val_fp.resolve())

        test_fp = Path(__file__, '..', 'predictors',
                       'target_tagger_predictor_test.py')
        test_str = str(test_fp.resolve())

        assert len(empty_params) == 0
        if test_data:
            AllenNLPModel._add_dataset_paths(empty_params, train_fp, val_fp,
                                             test_fp)
            assert len(empty_params) == 3
            assert empty_params['train_data_path'] == train_str
            assert empty_params['validation_data_path'] == val_str
            assert empty_params['test_data_path'] == test_str
        else:
            AllenNLPModel._add_dataset_paths(empty_params, train_fp, val_fp)
            assert len(empty_params) == 2
            assert empty_params['train_data_path'] == train_str
            assert empty_params['validation_data_path'] == val_str

        # Test when the params were not empty
        full_params = Params({'train_data_path': 'something', 'another': 1})
        assert len(full_params) == 2
        assert full_params['train_data_path'] == 'something'
        if test_data:
            AllenNLPModel._add_dataset_paths(full_params, train_fp, val_fp,
                                             test_fp)
            assert len(full_params) == 4
            assert full_params['train_data_path'] == train_str
            assert full_params['validation_data_path'] == val_str
            assert full_params['test_data_path'] == test_str
            assert full_params['another'] == 1
        else:
            AllenNLPModel._add_dataset_paths(full_params, train_fp, val_fp)
            assert len(full_params) == 3
            assert full_params['train_data_path'] == train_str
            assert full_params['validation_data_path'] == val_str
            assert full_params['another'] == 1
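`_add_dataset_paths` itself is not shown here. From the assertions above, a plausible sketch (an assumption, not the library's actual code) resolves each path and writes it into the `Params` object, leaving unrelated keys untouched:

from pathlib import Path
from typing import Optional
from allennlp.common import Params

def _add_dataset_paths(params: Params, train_fp: Path, val_fp: Path,
                       test_fp: Optional[Path] = None) -> None:
    # Sketch only: stores the resolved paths under the standard AllenNLP
    # dataset keys; the test path is added only when one is given.
    params['train_data_path'] = str(train_fp.resolve())
    params['validation_data_path'] = str(val_fp.resolve())
    if test_fp is not None:
        params['test_data_path'] = str(test_fp.resolve())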
Example #7
    def test_set_random_seeds(self):
        # Test the case where the params object is empty
        empty_params = Params({})
        assert len(empty_params) == 0
        AllenNLPModel._set_random_seeds(empty_params)
        assert len(empty_params) == 3
        seed_keys = ["random_seed", "numpy_seed", "pytorch_seed"]
        for key in seed_keys:
            assert isinstance(empty_params[key], int)
            assert empty_params[key] in range(1, 99999)

        # Test the case where the params are not empty and contain the seed
        # keys
        original_values = {
            "random_seed": 599999,
            "numpy_seed": 599999,
            "pytorch_seed": 799999
        }
        seed_params = Params(copy.deepcopy(original_values))
        assert len(seed_params) == 3
        AllenNLPModel._set_random_seeds(seed_params)
        for key, value in original_values.items():
            assert value != seed_params[key]
            assert seed_params[key] in range(1, 99999)
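Again the helper is not shown; the assertions imply that `_set_random_seeds` overwrites (or creates) the three seed keys with fresh integers inside range(1, 99999). A hedged sketch, not the library's actual code:

import random
from allennlp.common import Params

def _set_random_seeds(params: Params) -> None:
    # Sketch only: draws a new random integer for each seed key,
    # replacing any existing value, as the test above asserts.
    for key in ['random_seed', 'numpy_seed', 'pytorch_seed']:
        params[key] = random.randint(1, 99998)  # within range(1, 99999)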
Example #8
                        type=parse_path,
                        help='File Path to the Model configuration file')
    parser.add_argument('model_save_dir',
                        type=parse_path,
                        help='Directory to save the trained model')
    parser.add_argument('data_fp',
                        type=parse_path,
                        help='File Path to the data to predict on')
    parser.add_argument('output_data_fp',
                        type=parse_path,
                        help='File Path to the output predictions')
    args = parser.parse_args()

    dataset_name = args.dataset_name
    model_name = f'{dataset_name} model'
    model = AllenNLPModel(model_name, args.model_config, 'target-tagger',
                          args.model_save_dir)

    if dataset_name == 'semeval_2014':
        if not args.train_fp or not args.test_fp:
            raise ValueError('If training and predicting for the SemEval '
                             'datasets the training and test file paths must '
                             'be given')
        # As we are performing target extraction we use the conflict polarity
        # targets, in line with prior work
        train_data = semeval_2014(args.train_fp, conflict=True)
        test_data = semeval_2014(args.test_fp, conflict=True)
    else:
        temp_election_directory = Path('.', 'data', 'twitter_election_dataset')
        train_data = wang_2017_election_twitter_train(temp_election_directory)
        test_data = wang_2017_election_twitter_test(temp_election_directory)
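The snippet ends after the data loading; a hypothetical continuation (assuming the `fit` and `predict_into_collection` API exercised in the tests above, plus a `to_json_file` save method) might train the tagger and write its predictions out:

# Hypothetical continuation, not part of the original snippet.
model.fit(train_data, val_data)
key_mappings = {'tags': 'predicted_tags', 'words': 'predicted_tokens'}
test_data = model.predict_into_collection(test_data, key_mappings)
test_data.to_json_file(args.output_data_fp)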
Example #9
    def test_predict_iter(self, batch_size: Optional[int],
                          yield_original_target: bool):
        data = [{
            "text": "The laptop case was great and cover was rubbish"
        }, {
            "text": "Another day at the office"
        }, {
            "text": "The laptop case was great and cover was rubbish"
        }]
        # Test that it raises an AssertionError when the model attribute is
        # None
        model_dir = self.TARGET_EXTRACTION_MODEL
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              model_dir)
        with pytest.raises(AssertionError):
            for _ in model._predict_iter(
                    data,
                    batch_size=batch_size,
                    yield_original_target=yield_original_target):
                pass
        # Test that it raises a TypeError when the data provided is not a
        # list or an iterable
        model.load()
        non_iter_data = 5
        with pytest.raises(TypeError):
            for _ in model._predict_iter(
                    non_iter_data,
                    batch_size=batch_size,
                    yield_original_target=yield_original_target):
                pass
        # Test that it works on the normal cases which are lists and iterables
        for data_type in [data, iter(data)]:
            predictions = []
            for prediction in model._predict_iter(
                    data_type,
                    batch_size=batch_size,
                    yield_original_target=yield_original_target):
                predictions.append(prediction)
            assert 3 == len(predictions)
            predictions_0 = predictions[0]
            predictions_1 = predictions[1]

            if yield_original_target:
                assert isinstance(predictions_0, tuple)
                for pred_index, prediction_tuple in enumerate(predictions):
                    _, original_data_dict = prediction_tuple
                    assert len(data[pred_index]) == len(original_data_dict)
                    for key, value in data[pred_index].items():
                        assert value == original_data_dict[key]
                predictions_0 = predictions_0[0]
                predictions_1 = predictions_1[0]
            assert isinstance(predictions_0, dict)
            assert 6 == len(predictions_1)
            assert 5 == len(predictions_1['tags'])
            assert 9 == len(predictions_1['class_probabilities'])

            correct_text_1 = "Another day at the office"
            correct_tokens_1 = correct_text_1.split()
            assert correct_tokens_1 == predictions_1['words']
            assert correct_text_1 == predictions_1['text']

        # Test that it works on a larger dataset of 150
        larger_dataset = data * 50
        for data_type in [larger_dataset, iter(larger_dataset)]:
            predictions = []
            for prediction in model._predict_iter(
                    data_type,
                    batch_size=batch_size,
                    yield_original_target=yield_original_target):
                predictions.append(prediction)
            assert 150 == len(predictions)
            predictions_0 = predictions[0]
            predictions_1 = predictions[-1]
            predictions_2 = predictions[-2]
            if yield_original_target:
                predictions_0 = predictions_0[0]
                predictions_1 = predictions_1[0]
                predictions_2 = predictions_2[0]
            assert isinstance(predictions_0, dict)
            assert 5 == len(predictions_2['tags'])
            assert 9 == len(predictions_2['class_probabilities'])
            assert 9 == len(predictions_1['tags'])
            assert 9 == len(predictions_1['class_probabilities'])

        # Test the case where no data is fed, which can happen when an
        # iterator has already been exhausted, e.g.
        alt_data = iter(data)
        # exhaust the iterator so that it yields no further data
        assert 3 == len(list(alt_data))
        predictions = []
        for prediction in model._predict_iter(
                alt_data,
                batch_size=batch_size,
                yield_original_target=yield_original_target):
            predictions.append(prediction)
        assert not predictions
Example #10
 def test_repr_(self):
     model = AllenNLPModel('ML', self.CONFIG_FILE, 'target-tagger')
     model_repr = repr(model)
     assert model_repr == 'ML'
Example #11
    def test_load(self):
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
        # Test the simple case where, when no save directory is given, an
        # AssertionError is raised
        with pytest.raises(AssertionError):
            model.load()
        # Test the case where the save directory attribute exists but does
        # not point to a directory containing a saved model
        with tempfile.TemporaryDirectory() as tempdir:
            fake_file = Path(tempdir, 'fake file')
            model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                                  fake_file)
            with pytest.raises(FileNotFoundError):
                model.load()
        # The success case
        model_dir = self.TARGET_EXTRACTION_MODEL
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              model_dir)
        assert model.model is None

        same_model = model.load()
        assert isinstance(same_model, Model)
        assert model.model is not None
Example #12
train_data, val_data = train_test_split(train_data, test_size=test_size)
train_data = TargetTextCollection(train_data)
val_data = TargetTextCollection(val_data)

datasets = [train_data, val_data, test_data]
tokenizer = spacy_tokenizer()
sizes = []
for dataset in datasets:
    dataset.tokenize(tokenizer)
    dataset.sequence_labels()
    sizes.append(len(dataset))
print(f'Lengths {sizes[0]}, {sizes[1]}, {sizes[2]}')
save_dir = Path('.', 'models', 'glove_model')
param_file = Path('.', 'training_configs', 'Target_Extraction',
                  'General_Domain', 'Glove_LSTM_CRF.jsonnet')
model = AllenNLPModel('Glove', param_file, 'target-tagger', save_dir)

if not save_dir.exists():
    model.fit(train_data, val_data, test_data)
else:
    model.load()
import time
start_time = time.time()
val_iter = iter(val_data.values())
for val_predictions in model.predict_sequences(val_data.values()):
    relevant_val = next(val_iter)
    relevant_val['predicted_sequence_labels'] = val_predictions[
        'sequence_labels']
print(time.time() - start_time)
another_time = time.time()
# Time a second pass over the same validation data
for val_predictions in model.predict_sequences(val_data.values()):
    pass
print(time.time() - another_time)