def create_flair_corpus(self, train_samples_path, test_samples_path, val_samples_path):
    """
    Create a flair Corpus object and save it to train, test, validation files.

    Each split is only generated if its output file does not already exist,
    so re-runs are cheap and idempotent.

    :param train_samples_path: Path to train samples
    :param test_samples_path: Path to test samples
    :param val_samples_path: Path to validation samples
    :return:
    """
    if not path.exists("flair_train.txt"):
        train_samples = InputSample.read_dataset_json(train_samples_path)
        # Drop samples with no annotated spans — they carry no labels
        # and would only dilute the training signal.
        train_tagged = [
            sample for sample in train_samples if len(sample.spans) > 0
        ]
        print(
            f"Kept {len(train_tagged)} train samples after removal of non-tagged samples"
        )
        train_data = InputSample.create_conll_dataset(train_tagged)
        self.to_flair(train_data, outfile="flair_train.txt")

    # Test and validation splits are kept as-is (no span filtering),
    # so evaluation reflects the full distribution.
    if not path.exists("flair_test.txt"):
        test_samples = InputSample.read_dataset_json(test_samples_path)
        test_data = InputSample.create_conll_dataset(test_samples)
        self.to_flair(test_data, outfile="flair_test.txt")

    if not path.exists("flair_val.txt"):
        val_samples = InputSample.read_dataset_json(val_samples_path)
        val_data = InputSample.create_conll_dataset(val_samples)
        self.to_flair(val_data, outfile="flair_val.txt")
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Check the Presidio analyzer against a generated dataset text file.

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Load the generated samples relative to this test file.
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(test_input.format(here))

    # Map dataset entity names onto Presidio's entity vocabulary before scoring.
    aligned = Evaluator.align_entity_types(
        input_samples=samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    wrapper = PresidioAnalyzerWrapper()
    evaluator = Evaluator(model=wrapper)
    results = evaluator.evaluate_all(aligned)
    scores = evaluator.calculate_score(evaluation_results=results)

    assert scores.pii_precision >= acceptance_threshold
    assert scores.pii_recall >= acceptance_threshold
def test_to_conll():
    """Converting samples to CoNLL should yield one sentence per input sample."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(
        os.path.join(here, "data/generated_small.json")
    )
    conll = InputSample.create_conll_dataset(samples)

    unique_sentences = conll["sentence"].unique()
    assert len(unique_sentences) == len(samples)
def test_dataset_to_metric_identity_model():
    """A mock model that echoes the gold labels must score perfect precision/recall."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(
        "{}/data/generated_small.json".format(here), length=10
    )

    evaluator = Evaluator(model=IdentityTokensMockModel())
    results = evaluator.evaluate_all(samples)
    metrics = evaluator.calculate_score(results)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1
def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Score the spaCy PERSON recognizer against a generated dataset text file.

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Load the generated samples relative to this test file.
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(test_input.format(here))

    scores = score_presidio_recognizer(
        SpacyRecognizer(), ["PERSON"], samples, with_nlp_artifacts=True
    )
    assert scores.pii_f >= acceptance_threshold
def test_spacy_simple():
    """With a single entity kept, PII-level scores should equal the PERSON scores."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(
        os.path.join(here, "data/generated_small.json")
    )

    model = SpacyModel(model_name="en_core_web_sm", entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=model)
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # Only PERSON is kept, so the aggregate PII score collapses to that entity.
    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Score the credit card recognizer against a generated dataset text file.

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Load the generated samples relative to this test file.
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(test_input.format(here))

    scores = score_presidio_recognizer(
        recognizer=CreditCardRecognizer(),
        entities_to_keep=["CREDIT_CARD"],
        input_samples=samples,
    )
    assert scores.pii_f >= acceptance_threshold
def test_dataset_to_metric_50_50_model():
    """A model that is right on ~half the samples keeps precision perfect but halves recall."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = InputSample.read_dataset_json(
        "{}/data/generated_small.json".format(here), length=100
    )

    # Replace 50% of the predictions with a list of "O"
    model = FiftyFiftyIdentityTokensMockModel()
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])
    results = evaluator.evaluate_all(samples)
    metrics = evaluator.calculate_score(results)

    print(metrics.pii_precision)
    print(metrics.pii_recall)
    print(metrics.pii_f)

    # Everything predicted is correct (precision 1), but only about half
    # of the true entities are found.
    assert metrics.pii_precision == 1
    assert 0.25 < metrics.pii_recall < 0.75
def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Run data through one EntityRecognizer and gather results and stats
    """
    if input_samples:
        # Defensive copy so downstream mutation doesn't touch the caller's list.
        input_samples = list(input_samples)
    else:
        print("Reading dataset")
        input_samples = InputSample.read_dataset_json(
            "../../data/synth_dataset_v2.json")

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )
    aligned_samples = Evaluator.align_entity_types(
        input_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map)

    wrapped_model = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )
    return score_model(
        model=wrapped_model,
        entities_to_keep=entities_to_keep,
        input_samples=aligned_samples,
        verbose=verbose,
    )
def small_dataset():
    """Load the small generated dataset that sits next to this test file."""
    data_file = Path(__file__).parent / "data" / "generated_small.json"
    return InputSample.read_dataset_json(data_file)
def small_dataset() -> List[InputSample]:
    """Load the small generated dataset that sits next to this test file."""
    here = os.path.dirname(os.path.realpath(__file__))
    return InputSample.read_dataset_json(
        os.path.join(here, "data/generated_small.json")
    )