Example #1
 def test_predict_sequences(self, batch_size: Optional[int]):
     data = [{
         "text": "The laptop case was great and cover was rubbish"
     }, {
         "text": "Another day at the office"
     }, {
         "text": "The laptop case was great and cover was rubbish"
     }]
     answers = [{
         "sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
         "text":
         "The laptop case was great and cover was rubbish",
         "tokens":
         "The laptop case was great and cover was rubbish".split()
     }, {
         "sequence_labels": ['O', 'B', 'B', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4],
         "text": "Another day at the office",
         "tokens": "Another day at the office".split()
     }, {
         "sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
         "text":
         "The laptop case was great and cover was rubbish",
         "tokens":
         "The laptop case was great and cover was rubbish".split()
     }]
     # Requires the softmax rather than the CRF version, as we want the
     # returned confidence scores to be greater than 1 / number of labels,
     # whereas the CRF maximises whole sentence level predictions, so the
     # per-token confidence it returns can be less than 1 / number of labels
     # (see the short sketch after this example).
     model_dir = self.TARGET_EXTRACTION_SF_MODEL
     model = AllenNLPModel('TE', self.SOFTMAX_CONFIG_FILE, 'target-tagger',
                           model_dir)
     model.load()
     predictions = []
     for index, prediction in enumerate(
             model.predict_sequences(data, batch_size)):
         predictions.append(prediction)
         answer = answers[index]
         assert 4 == len(prediction)
         for key, value in answer.items():
             assert len(value) == len(prediction[key])
             if key != 'confidence':
                 assert value == prediction[key]
             else:
                 for confidence_score in prediction[key]:
                     assert 0.333333 < confidence_score
                     assert 1 > confidence_score
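
A minimal sketch of the reasoning in the comment above: when each token's label is taken as the argmax of a per-token softmax over N labels, its probability can never fall below 1/N, which is what the confidence assertions rely on. The three-label tag set and the toy logits below are assumptions for illustration, not values taken from the model.

import numpy as np

def softmax(logits: np.ndarray) -> np.ndarray:
    exp = np.exp(logits - logits.max())
    return exp / exp.sum()

num_labels = 3  # assumed BIO-style tag set, e.g. 'B', 'I', 'O'
rng = np.random.default_rng(0)
for _ in range(1000):
    token_logits = rng.normal(size=num_labels)  # toy per-token scores
    probs = softmax(token_logits)
    # The largest entry of a probability distribution over N labels is >= 1/N,
    # so a softmax tagger's per-token confidence satisfies the test's lower bound.
    assert probs.max() >= 1 / num_labels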
Example #2
              f'{target_sizes[1]}, Test: {target_sizes[2]}')
        print('Fitting model')
        model.fit(train_data, val_data, test_data)
        print('Finished fitting model\nNow Evaluating model:')
    else:
        test_data.tokenize(spacy_tokenizer())
        device = -1
        if args.cuda:
            device = 0
        model.load(cuda_device=device)
        print('Finished loading model\nNow Evaluating model:')

    for data in test_data.values():
        data['tokens'] = data['tokenized_text']
    test_iter = iter(test_data.values())
    for test_pred in model.predict_sequences(test_data.values(),
                                             batch_size=args.batch_size):
        relevant_test = next(test_iter)
        relevant_test['predicted_sequence_labels'] = test_pred[
            'sequence_labels']
    test_scores = test_data.exact_match_score('predicted_sequence_labels')
    print(f'Test F1 scores: {test_scores[2]}')

    first = True
    data_fp = args.data_fp
    from time import time
    t = time()
    if args.number_to_predict_on:
        data_count = 0
        with data_fp.open('r') as data_file:
            for line in data_file:
                data_count += 1
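
The pairing loop above keeps a second iterator (test_iter) in lockstep with predict_sequences. Assuming, as that loop does, that predict_sequences yields exactly one prediction per input dict and preserves order, the same pattern can be written with zip; the attach_predictions helper below is only an illustrative sketch, not part of the library.

def attach_predictions(model, examples, batch_size):
    # 'examples' must be re-iterable (e.g. a list or a dict view such as
    # test_data.values()) because it is read twice: once by the predictor
    # and once by zip.
    predictions = model.predict_sequences(examples, batch_size=batch_size)
    for example, prediction in zip(examples, predictions):
        example['predicted_sequence_labels'] = prediction['sequence_labels']

# Equivalent to the pairing loop above:
# attach_predictions(model, test_data.values(), args.batch_size)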
Example #3
    dataset.sequence_labels()
    sizes.append(len(dataset))
print(f'Lengths {sizes[0]}, {sizes[1]}, {sizes[2]}')
save_dir = Path('.', 'models', 'glove_model')
param_file = Path('.', 'training_configs', 'Target_Extraction',
                  'General_Domain', 'Glove_LSTM_CRF.jsonnet')
model = AllenNLPModel('Glove', param_file, 'target-tagger', save_dir)

if not save_dir.exists():
    model.fit(train_data, val_data, test_data)
else:
    model.load()
import time
start_time = time.time()
val_iter = iter(val_data.values())
for val_predictions in model.predict_sequences(val_data.values()):
    relevant_val = next(val_iter)
    relevant_val['predicted_sequence_labels'] = val_predictions[
        'sequence_labels']
print(time.time() - start_time)
another_time = time.time()
for val_predictions in model.predict_sequences(val_data.values()):
    pass
print(time.time() - another_time)
print('done')
print(val_data.exact_match_score('predicted_sequence_labels')[2])
test_iter = iter(test_data.values())
for test_pred in model.predict_sequences(test_data.values()):
    relevant_test = next(test_iter)
    relevant_test['predicted_sequence_labels'] = test_pred['sequence_labels']
print(test_data.exact_match_score('predicted_sequence_labels')[2])
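
The fit-or-load check and the ad-hoc timing in this example can be factored into small helpers so the script reads more clearly on re-runs. This is only a sketch around the calls shown above (fit_or_load and timed_predict are not part of AllenNLPModel), and it uses time.perf_counter, a monotonic clock better suited to benchmarking than time.time.

import time
from pathlib import Path

def fit_or_load(model, save_dir: Path, train_data, val_data, test_data):
    # Train only when no saved model exists, mirroring the save_dir check above.
    if not save_dir.exists():
        model.fit(train_data, val_data, test_data)
    else:
        model.load()

def timed_predict(model, examples):
    start = time.perf_counter()
    predictions = list(model.predict_sequences(examples))
    print(f'Prediction time: {time.perf_counter() - start:.2f}s')
    return predictions

Calling fit_or_load(model, save_dir, train_data, val_data, test_data) and then timed_predict(model, list(val_data.values())) reproduces the train-or-load and timing steps shown above.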