Example #1
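Fits an AllenNLPModel target tagger on small JSON fixtures, checks the vocabulary built with and without a test split, and verifies that a save_dir persists the trained archive as model.tar.gz. It assumes the surrounding test module's imports (tempfile, pathlib.Path, allennlp's Model, TargetTextCollection, AllenNLPModel).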
    def test_target_extraction_fit(self, test_data: bool):
        # `test_data` is a parametrised flag here; when True it is re-bound
        # below to the loaded test collection.
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
        # The underlying AllenNLP model is built lazily: None until fit().
        assert model.model is None

        train_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        # The small training fixture is reused as the validation set.
        val_data = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)

        tokens_in_vocab = [
            'at', 'case', 'was', 'the', 'day', 'great', 'cover', 'office',
            'another', 'and', 'rubbish', 'laptop', '@@PADDING@@', '@@UNKNOWN@@'
        ]
        if test_data:
            # Passing a test split to fit() adds its extra token ('better')
            # to the vocabulary.
            tokens_in_vocab.append('better')
            test_data = TargetTextCollection.load_json(
                self.TARGET_EXTRACTION_TEST_DATA)
            model.fit(train_data, val_data, test_data)
        else:
            model.fit(train_data, val_data)

        # The fitted model's vocabulary should contain exactly the expected
        # tokens, including AllenNLP's @@PADDING@@ and @@UNKNOWN@@ entries.
        token_index = model.model.vocab.get_token_to_index_vocabulary('tokens')
        assert len(token_index) == len(tokens_in_vocab)
        for token in tokens_in_vocab:
            assert token in token_index

        # Check attributes have changed.
        assert model.model is not None
        assert isinstance(model.model, Model)

        # Check that it will save to a directory of our choosing
        with tempfile.TemporaryDirectory() as save_dir:
            saved_model_fp = Path(save_dir, 'model.tar.gz')
            assert not saved_model_fp.exists()
            model = AllenNLPModel('TE',
                                  self.CONFIG_FILE,
                                  'target-tagger',
                                  save_dir=save_dir)
            model.fit(train_data, val_data)
            assert saved_model_fp.exists()
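
A minimal sketch (not part of the original test) tying Example #1's save_dir behaviour to the load() call used in Example #2 below. The import paths and the CONFIG_FILE/TRAIN_FP/VAL_FP values are assumptions, not taken from the original code; only the AllenNLPModel constructor, fit() and load(cuda_device=...) calls are grounded in the two examples.

from pathlib import Path

from target_extraction.allen import AllenNLPModel              # assumed import path
from target_extraction.data_types import TargetTextCollection  # assumed import path

CONFIG_FILE = Path('configs', 'target_tagger.jsonnet')  # placeholder path
TRAIN_FP = Path('data', 'train.json')                   # placeholder path
VAL_FP = Path('data', 'val.json')                       # placeholder path
SAVE_DIR = Path('saved_models', 'target_tagger')        # placeholder path

train_data = TargetTextCollection.load_json(TRAIN_FP)
val_data = TargetTextCollection.load_json(VAL_FP)

# fit() with a save_dir writes the archive to SAVE_DIR/model.tar.gz,
# which is exactly what the assertions at the end of Example #1 check.
model = AllenNLPModel('TE', CONFIG_FILE, 'target-tagger', save_dir=SAVE_DIR)
model.fit(train_data, val_data)

# A fresh instance pointed at the same directory can then skip fit(),
# mirroring the non-training branch of Example #2.
model = AllenNLPModel('TE', CONFIG_FILE, 'target-tagger', save_dir=SAVE_DIR)
model.load(cuda_device=-1)  # -1 = CPU, as in Example #2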
Example #2
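From a training/evaluation script; the snippet picks up inside a loop over the train, validation and test collections. Each collection is purged of sequence-label errors, the model is either fitted or loaded from disk, and sequence predictions are then streamed over the test set.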
                # Remove every TargetText whose sequence labels could not
                # be created.
                for error in returned_errors:
                    error_id = error['text_id']
                    del dataset[error_id]
            # Re-run the labelling to confirm the collection is now clean.
            returned_errors = dataset.sequence_labels(return_errors=True)
            if returned_errors:
                raise ValueError('Sequence label errors are still persisting')
            sizes.append(len(dataset))
            # Bare annotation: a type-checker hint only, executed as a no-op.
            dataset: TargetTextCollection
            target_sizes.append(dataset.number_targets())
        print(
            f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, Test: {sizes[2]}'
        )
        print(f'Number of targets, Train: {target_sizes[0]}, Validation: '
              f'{target_sizes[1]}, Test: {target_sizes[2]}')
        print('Fitting model')
        model.fit(train_data, val_data, test_data)
        print('Finished fitting model\nNow Evaluating model:')
    else:
        # Not re-training: tokenize the test split and load the previously
        # saved model, on GPU device 0 when --cuda is given, otherwise CPU.
        test_data.tokenize(spacy_tokenizer())
        device = 0 if args.cuda else -1
        model.load(cuda_device=device)
        print('Finished loading model\nNow Evaluating model:')

    # The sequence predictor expects a 'tokens' key, so copy across the
    # tokenized text produced by tokenize().
    for data in test_data.values():
        data['tokens'] = data['tokenized_text']
    # Predictions are yielded in input order, so a parallel iterator pairs
    # each prediction with the TargetText it was made for.
    test_iter = iter(test_data.values())
    for test_pred in model.predict_sequences(test_data.values(),
                                             batch_size=args.batch_size):
        relevant_test = next(test_iter)
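        # Hypothetical continuation, not in the original snippet: compare the
        # prediction with the gold labels that sequence_labels() attached in
        # the training branch (so this assumes that path was taken). The
        # 'sequence_labels' key on test_pred is an assumed name, not
        # confirmed by the code above.
        predicted = test_pred['sequence_labels']   # assumed key
        gold = relevant_test['sequence_labels']
        correct = sum(p == g for p, g in zip(predicted, gold))
        print(f"{relevant_test['text_id']}: {correct}/{len(gold)} tokens correct")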