def test_credit_card_recognizer_with_template(pii_csv, utterances, num_of_examples, acceptance_threshold):
    """
    Test credit card recognizer with a dataset generated from template and a CSV values file

    :param pii_csv: input csv file location
    :param utterances: template file location
    :param num_of_examples: number of samples to be used from dataset to test
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Resolve paths relative to this test module.
    import os

    base_dir = os.path.dirname(os.path.realpath(__file__))

    # Generate synthetic examples from the template file.
    data_generator = PresidioDataGenerator()
    template_path = utterances.format(base_dir)
    fake_records = data_generator.generate_fake_data(
        templates=template_path, n_samples=num_of_examples
    )

    samples = [InputSample.from_faker_spans_result(record) for record in fake_records]

    # Score the credit-card recognizer against the generated samples.
    scores = score_presidio_recognizer(
        recognizer=CreditCardRecognizer(),
        entities_to_keep=["CREDIT_CARD"],
        input_samples=samples,
    )

    # pii_f is NaN when no relevant entities were produced; only gate on real scores.
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
def test_faker_spans_result_to_input_sample(faker_span_result):
    """
    Verify that a FakerSpansResult converts to an InputSample with the expected
    full text, masked template, and span, without generating tags.

    :param faker_span_result: fixture providing a faker spans result for "Dan is my name."
    """
    input_sample = InputSample.from_faker_spans_result(
        faker_span_result, create_tags_from_span=False
    )
    assert input_sample.full_text == "Dan is my name."
    assert input_sample.masked == "{{name}} is my name."
    # Fixed: the original asserted the same span equality twice; one check suffices.
    assert input_sample.spans[0] == Span("name", "Dan", 0, 3)
def test_faker_spans_to_input_sample_with_tags(faker_span_result):
    """
    Verify that converting a FakerSpansResult with tag creation enabled produces
    tokens and BILUO tags, including a single-token ("U-") name tag.

    :param faker_span_result: fixture providing a faker spans result for a name template
    """
    input_sample = InputSample.from_faker_spans_result(
        faker_span_result, create_tags_from_span=True, scheme="BILUO"
    )
    assert input_sample.tags
    assert input_sample.tokens
    # Generator expression instead of a throwaway list inside any().
    assert any("U-name" in tag for tag in input_sample.tags)
def test_pattern_recognizer(
    pii_csv,
    ext_csv,
    utterances,
    entity_name,
    pattern,
    score,
    num_of_examples,
    acceptance_threshold,
    max_mistakes_number,
):
    """
    Test generic pattern recognizer with a dataset generated from template,
    a CSV values file with common entities
    and another CSV values file with a custom entity

    :param pii_csv: input csv file location with the common entities
    :param ext_csv: input csv file location with custom entities
    :param utterances: template file location
    :param entity_name: custom entity name
    :param pattern: recognizer pattern
    :param score: confidence score assigned to the recognizer pattern
    :param num_of_examples: number of samples to be used from dataset to test
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    :param max_mistakes_number: maximum number of model errors allowed for tests to pass
    """
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    dfpii = pd.read_csv(pii_csv.format(dir_path), encoding="utf-8")
    dfext = pd.read_csv(ext_csv.format(dir_path), encoding="utf-8")
    ext_column_name = dfext.columns[0]

    def get_from_ext(i):
        # Cycle through the custom-entity values so every pii row gets one.
        index = i % dfext.shape[0]
        return dfext.iat[index, 0]

    # Extend the common-entity frame with a column of custom-entity values.
    dfpii[ext_column_name] = [get_from_ext(i) for i in range(dfpii.shape[0])]

    # Generate examples from the template file.
    generator = PresidioDataGenerator()
    templates = utterances.format(dir_path)
    examples = generator.generate_fake_data(
        templates=templates, n_samples=num_of_examples
    )

    input_samples = [
        InputSample.from_faker_spans_result(example) for example in examples
    ]

    # Build a one-pattern recognizer for the custom entity and score it.
    pattern = Pattern("test pattern", pattern, score)
    pattern_recognizer = PatternRecognizer(
        entity_name, name="test recognizer", patterns=[pattern]
    )

    scores = score_presidio_recognizer(
        recognizer=pattern_recognizer,
        entities_to_keep=[entity_name],
        input_samples=input_samples,
    )

    # pii_f is NaN when nothing relevant was detected; only gate on real scores.
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
        assert max_mistakes_number >= len(scores.model_errors)