Exemplo n.º 1
0
 def evaluate(self,
              file_name,
              line,
              original_sequence,
              corrupt_sequence,
              predicted_sequence,
              ground_truth_corruptions=None,
              probability_predicted_corruption_pairs=None,
              runtime=0,
              evaluate_ed=True):
     """Evaluate one predicted sequence and record the outcome.

     Builds a ``SequenceResult`` from the given sequences, appends it to
     ``self.sequence_results`` and returns it.

     :param file_name: passed through to the ``SequenceResult``.
     :param line: passed through to the ``SequenceResult``.
     :param original_sequence: the reference sequence.
     :param corrupt_sequence: the corrupted input sequence.
     :param predicted_sequence: the sequence produced by the predictor.
     :param ground_truth_corruptions: when ``None``, computed with
         ``get_space_corruptions(original_sequence, corrupt_sequence)``.
     :param probability_predicted_corruption_pairs: iterable of
         ``(probability, corruption)`` pairs; when ``None``, the corruptions
         are derived from ``predicted_sequence`` and each gets probability 1.
     :param runtime: passed through to the ``SequenceResult``.
     :param evaluate_ed: when false, both edit distances are reported as 0
         and ``levenshtein`` is not called.
     :return: the ``SequenceResult`` that was appended.
     """
     # Fall back to computing the ground truth from the sequence pair.
     if ground_truth_corruptions is None:
         ground_truth_corruptions = get_space_corruptions(original_sequence, corrupt_sequence)

     # No explicit predictions given: read them off the predicted sequence
     # and attach a probability of 1 to each.
     if probability_predicted_corruption_pairs is None:
         probability_predicted_corruption_pairs = [
             (1, corruption)
             for corruption in get_space_corruptions(predicted_sequence, corrupt_sequence,
                                                     ignore_other_insertions=False)
         ]

     # Edit distances to the original, before and after the prediction.
     ed_before = ed_after = 0
     if evaluate_ed:
         ed_before = levenshtein(original_sequence, corrupt_sequence, substitutions=False)
         ed_after = levenshtein(original_sequence, predicted_sequence, substitutions=False)

     # Map each predicted corruption to its probability; a corruption that
     # appears in several pairs keeps the probability of the last one.
     probabilities = {}
     for probability, corruption in probability_predicted_corruption_pairs:
         probabilities[corruption] = probability
     predicted = probabilities.keys()

     # True positive / false positive / false negative sets per corruption type.
     insertion_sets = tp_fp_fn_by_type(ground_truth_corruptions, predicted, CorruptionType.INSERTION)
     tp_insertions, fp_insertions, fn_insertions = insertion_sets
     deletion_sets = tp_fp_fn_by_type(ground_truth_corruptions, predicted, CorruptionType.DELETION)
     tp_deletions, fp_deletions, fn_deletions = deletion_sets

     # Register and hand back the result for this sequence.
     result = SequenceResult(
         file_name=file_name,
         line=line,
         original_sequence=original_sequence,
         corrupt_sequence=corrupt_sequence,
         predicted_sequence=predicted_sequence,
         ground_truth_corruptions=ground_truth_corruptions,
         predicted_corruption_probabilities=probabilities,
         tp_insertions=tp_insertions,
         fp_insertions=fp_insertions,
         fn_insertions=fn_insertions,
         tp_deletions=tp_deletions,
         fp_deletions=fp_deletions,
         fn_deletions=fn_deletions,
         ed_before=ed_before,
         ed_after=ed_after,
         runtime=runtime,
     )
     self.sequence_results.append(result)
     return result
 def add_predictions(self, key: str, predicted_sequence: str):
     """Register the prediction of the predictor identified by ``key``.

     Stores ``predicted_sequence`` under ``key`` in ``self.predicted_sequences``
     and adds ``key`` as a predictor to every corruption found between the
     predicted sequence and ``self.corrupt_sequence``. Corruptions not yet
     present in ``self.predictions`` are added with ``ground_truth=False``.
     """
     self.predicted_sequences[key] = predicted_sequence
     known = self.predictions
     for corruption in get_space_corruptions(predicted_sequence, self.corrupt_sequence):
         if corruption in known:
             entry = known[corruption]
         else:
             # First time this corruption is seen: it is not a ground truth one.
             entry = known[corruption] = Prediction(ground_truth=False)
         entry.add_predictor_key(key)
 def __init__(self,
              correct_sequence: str,
              corrupt_sequence: str):
     """Store the sequence pair and seed the prediction table.

     The ground truth corruptions are computed with
     ``get_space_corruptions(correct_sequence, corrupt_sequence)``; each one
     gets a ``Prediction(ground_truth=True)`` entry in ``self.predictions``.
     ``self.predicted_sequences`` starts out empty.
     """
     self.correct_sequence = correct_sequence
     self.corrupt_sequence = corrupt_sequence
     ground_truth = get_space_corruptions(correct_sequence, corrupt_sequence)
     self.ground_truth_corruptions = ground_truth
     # One fresh Prediction object per ground truth corruption.
     table = {}
     for corruption in ground_truth:
         table[corruption] = Prediction(ground_truth=True)
     self.predictions = table
     self.predicted_sequences = {}
 def test_get_space_corruptions(self):
     """Check get_space_corruptions on a pair with two insertions and two deletions."""
     original = "This is a test sequence. "
     corrupt = " This isa test seq uence."
     # (corruption type, position) of every expected space corruption, in order.
     expected_spec = (
         (CorruptionType.INSERTION, 0),
         (CorruptionType.DELETION, 8),
         (CorruptionType.INSERTION, 18),
         (CorruptionType.DELETION, 25),
     )
     expected = [Corruption(kind, position, ' ') for kind, position in expected_spec]
     actual = get_space_corruptions(original, corrupt)
     self.assertEqual(expected, actual)
    def predict(self, sequence):
        """Search for the best candidate sequence starting from ``sequence``.

        A ``CandidateQueue`` is seeded with ``sequence`` and expanded until it
        reports termination. Each popped candidate and its log likelihood, and
        finally the best sequence, are printed.

        :return: a pair ``(pairs, best_sequence)`` where ``pairs`` is a ``zip``
            of ``(1, corruption)`` tuples; it is empty when ``self.spelling``
            is set.
        """
        queue = CandidateQueue(self.corrector,
                               self.score,
                               tolerance_steps=self.tolerance_steps)
        queue.add_candidate(sequence, queue.get_log_likelihood(sequence))

        while not queue.terminated():
            current = queue.pop()
            print(queue.get_log_likelihood(current), current)
            # Pick the expansion routine matching the current mode.
            if self.spelling:
                expand = queue.get_candidates_spelling
            else:
                expand = queue.get_candidates
            for score, successor in expand(current):
                queue.add_candidate(successor, score)

        if self.spelling:
            predictions = []  # TODO spelling predictions
        else:
            predictions = get_space_corruptions(queue.best_sequence, sequence)
        # Every prediction gets the placeholder probability 1.
        placeholder_probabilities = [1 for _ in predictions]
        print(queue.best_sequence)
        return zip(placeholder_probabilities, predictions), queue.best_sequence
Exemplo n.º 6
0
if __name__ == "__main__":
    exclude_zero = "no-zero" in sys.argv

    out_folder = "acl_error_distribution/"

    absolute_values = []
    error_rates = []

    for subset in (Subset.DEVELOPMENT, Subset.TEST):
        benchmark = Benchmark("ACL", subset)
        for correct, corrupt in benchmark.get_sequence_pairs(
                BenchmarkFiles.CORRUPT):
            print(corrupt)
            print(correct)
            edits = get_space_corruptions(correct, corrupt)
            n_edits = len(edits)
            n_chars = len(correct)
            ratio = n_edits / n_chars
            absolute_values.append(n_edits)
            error_rates.append(ratio)

    save_histogram_data(error_rates,
                        out_folder + "tokenization_character_error_rates.txt")
    plot_rate_histogram(
        error_rates,
        title="Tokenization character error rates",
        subtitle="ACL development+test",
        xlabel=
        "Tokenization character error rate (whitespace errors / characters)",
        save_path=out_folder +
 def get_ground_truth(self):
     """Return the corruptions between ``self.correct`` and ``self.corrupt`` as a set."""
     corruptions = get_space_corruptions(self.correct, self.corrupt)
     return set(corruptions)