def test_evaluate_multiple_examples_correct_statistics():
    prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])
    input_sample = InputSample(
        "My name is Raphael or David", masked=None, spans=None
    )
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    evaluated = evaluator.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample]
    )
    scores = evaluator.calculate_score(evaluated)

    assert scores.pii_precision == 0.5
    assert scores.pii_recall == 0.5

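# The tests in this file rely on MockTokensModel from the project's test mocks.
# Below is a minimal sketch of such a mock under a hypothetical name, assuming
# the Evaluator only calls the model's predict(sample) method and reads
# entities_to_keep; the real MockTokensModel may differ.
class FixedPredictionModelSketch:
    """Hypothetical stand-in that returns the same token-level tags for any sample."""

    def __init__(self, prediction, entities_to_keep=None):
        self.prediction = prediction
        self.entities_to_keep = entities_to_keep

    def predict(self, sample):
        # Ignore the sample's content and echo the canned BILUO prediction.
        return self.prediction
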
def test_evaluate_multiple_tokens_no_match_match_correct_statistics():
    prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"])
    sample = InputSample(
        "I am the walrus americanus magnifico", masked=None, spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    evaluated = evaluator.evaluate_sample(sample, prediction)
    evaluation = evaluator.calculate_score([evaluated])

    assert np.isnan(evaluation.pii_precision)
    assert evaluation.pii_recall == 0

def test_evaluate_multiple_entities_to_keep_correct_statistics():
    prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    entities_to_keep = ["ANIMAL", "PLANT", "SPACESHIP"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=entities_to_keep)
    sample = InputSample(
        full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluation_result = evaluator.evaluate_sample(sample, prediction)

    assert evaluation_result.results[("O", "O")] == 2
    assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1
    assert evaluation_result.results[("O", "ANIMAL")] == 1

def test_evaluator_simple():
    prediction = ["O", "O", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction, entities_to_keep=["ANIMAL"])
    evaluator = Evaluator(model=model)
    sample = InputSample(
        full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluated = evaluator.evaluate_sample(sample, prediction)
    final_evaluation = evaluator.calculate_score([evaluated])

    assert final_evaluation.pii_precision == 1
    assert final_evaluation.pii_recall == 1

def test_analyzer_simple_input():
    model = PresidioAnalyzerWrapper(entities_to_keep=["PERSON"])
    sample = InputSample(
        full_text="My name is Mike",
        masked="My name is [PERSON]",
        spans=[Span("PERSON", "Mike", 11, 15)],
        create_tags_from_span=True,
    )

    prediction = model.predict(sample)
    evaluator = Evaluator(model=model)
    evaluated = evaluator.evaluate_sample(sample, prediction)
    metrics = evaluator.calculate_score([evaluated])

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1

def test_align_entity_types_wrong_mapping_exception():
    sample1 = InputSample(
        "I live in ABC",
        spans=[
            Span("A", "a", 0, 1),
            Span("A", "a", 10, 11),
            Span("B", "b", 100, 101),
        ],
        create_tags_from_span=False,
    )
    entities_mapping = {"Z": "z"}

    with pytest.raises(ValueError):
        Evaluator.align_entity_types(
            input_samples=[sample1], entities_mapping=entities_mapping
        )

def test_spacy_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    spacy_model = SpacyModel(model_name="en_core_web_lg", entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=spacy_model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    scores = evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0

def test_dataset_to_metric_50_50_model():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(
        "{}/data/generated_small.json".format(dir_path), length=100
    )

    # Replace 50% of the predictions with a list of "O"
    model = FiftyFiftyIdentityTokensMockModel()
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])
    evaluation_results = evaluator.evaluate_all(input_samples)
    metrics = evaluator.calculate_score(evaluation_results)

    print(metrics.pii_precision)
    print(metrics.pii_recall)
    print(metrics.pii_f)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall < 0.75
    assert metrics.pii_recall > 0.25

def test_flair_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    model = SequenceTagger.load("ner-ontonotes-fast")  # .load('ner')
    flair_model = FlairModel(model=model, entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=flair_model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    scores = evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0

def test_crf_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    model_path = os.path.abspath(
        os.path.join(dir_path, "..", "model-outputs/crf.pickle")
    )
    crf_model = CRFModel(model_pickle_path=model_path, entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=crf_model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    scores = evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0

def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """Run data through one EntityRecognizer and gather results and stats."""
    if not input_samples:
        print("Reading dataset")
        input_samples = InputSample.read_dataset_json(
            "../../data/synth_dataset_v2.json"
        )
    else:
        input_samples = list(input_samples)

    print("Preparing dataset by aligning entity names to Presidio's entity names")
    updated_samples = Evaluator.align_entity_types(
        input_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    model = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )
    return score_model(
        model=model,
        entities_to_keep=entities_to_keep,
        input_samples=updated_samples,
        verbose=verbose,
    )

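# A minimal usage sketch for score_presidio_recognizer. CreditCardRecognizer and
# its CREDIT_CARD entity come from presidio_analyzer's predefined recognizers;
# the default dataset path used inside score_presidio_recognizer is assumed to
# exist relative to the working directory.
def example_score_credit_card_recognizer() -> EvaluationResult:
    from presidio_analyzer.predefined_recognizers import CreditCardRecognizer

    return score_presidio_recognizer(
        recognizer=CreditCardRecognizer(),
        entities_to_keep=["CREDIT_CARD"],
    )
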
def score_presidio_analyzer(
    input_samples: Optional[List[InputSample]] = None,
    entities_to_keep: Optional[List[str]] = None,
    labeling_scheme: str = "BILUO",
    verbose: bool = True,
) -> EvaluationResult:
    """Run data through the Presidio Analyzer and gather results and stats."""
    if not input_samples:
        print("Reading dataset")
        input_samples = read_synth_dataset("../../data/synth_dataset.txt")
    else:
        input_samples = list(input_samples)

    print("Preparing dataset by aligning entity names to Presidio's entity names")
    updated_samples = Evaluator.align_entity_types(input_samples)

    from collections import Counter

    count_per_entity = Counter(
        span.entity_type
        for sample in updated_samples
        for span in sample.spans
    )
    if verbose:
        print("Count per entity:")
        print(count_per_entity)

    analyzer = PresidioAnalyzerWrapper(
        entities_to_keep=entities_to_keep, labeling_scheme=labeling_scheme
    )
    return score_model(
        model=analyzer,
        entities_to_keep=list(count_per_entity.keys()),
        input_samples=updated_samples,
        verbose=verbose,
    )

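# A minimal usage sketch for score_presidio_analyzer, restricting the run to
# PERSON entities. It assumes the default synth dataset path above is available;
# pass input_samples explicitly to evaluate a different dataset.
def example_score_analyzer_person_only() -> EvaluationResult:
    return score_presidio_analyzer(entities_to_keep=["PERSON"], verbose=False)
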
def test_align_entity_types_correct_output():
    sample1 = InputSample(
        "I live in ABC",
        spans=[
            Span("A", "a", 0, 1),
            Span("A", "a", 10, 11),
            Span("B", "b", 100, 101),
        ],
        create_tags_from_span=False,
    )
    sample2 = InputSample(
        "I live in ABC",
        spans=[
            Span("A", "a", 0, 1),
            Span("A", "a", 10, 11),
            Span("C", "c", 100, 101),
        ],
        create_tags_from_span=False,
    )
    samples = [sample1, sample2]
    mapping = {
        "A": "1",
        "B": "2",
        "C": "1",
    }

    new_samples = Evaluator.align_entity_types(samples, mapping)

    count_per_entity = Counter()
    for sample in new_samples:
        for span in sample.spans:
            count_per_entity[span.entity_type] += 1

    assert count_per_entity["1"] == 5
    assert count_per_entity["2"] == 1