def initialize(self):
    """
    Build the analyzer and anonymizer engines held by this object.

    Sets the class-level ``SpacyRecognizer.ENTITIES`` to ``["PERSON"]``
    and ``Replace.NEW_VALUE`` to ``'replace_text'``, then creates a
    ``SpacyNlpEngine`` whose ``'en'`` entry is replaced with the
    ``en_core_web_lg`` spaCy model loaded with the parser, tagger and
    lemmatizer disabled. The results are stored on
    ``self.analyzer_engine`` and ``self.anonymizer_engine``.
    """
    # Class-level settings: these affect every user of the classes,
    # not only this instance.
    SpacyRecognizer.ENTITIES = ["PERSON"]
    Replace.NEW_VALUE = 'replace_text'

    engine = SpacyNlpEngine()
    english_model = spacy.load(
        'en_core_web_lg',
        disable=["parser", "tagger", "lemmatizer"],
    )
    engine.nlp['en'] = english_model

    self.analyzer_engine = AnalyzerEngine(nlp_engine=engine)
    self.anonymizer_engine = AnonymizerEngine()
def __init__(self, registry=None, nlp_engine=None,
             app_tracer=None, enable_trace_pii=False,
             default_score_threshold=None,
             use_recognizer_store=False):
    """
    Set up the engine that orchestrates the detection of PII entities.

    Every collaborator that is missing (or falsy) is replaced by a
    default instance created here.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
    (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer,
    used to trace the logic used during each request
    :param enable_trace_pii: bool,
    defines whether PII values should be traced or not.
    :param default_score_threshold: Minimum confidence value
    for detected entities to be returned; stored as 0 when None
    :param use_recognizer_store: Whether to call the
    Presidio Recognizer Store on every request to gather
    responses from custom recognizers as well
    (only applicable for the full Presidio service). Only consulted
    when no registry is passed in.
    """
    if not nlp_engine:
        logger.info("nlp_engine not provided. "
                    "Creating new SpacyNlpEngine instance")
        from presidio_analyzer.nlp_engine import SpacyNlpEngine
        nlp_engine = SpacyNlpEngine()

    if not registry:
        logger.info("Recognizer registry not provided. "
                    "Creating default RecognizerRegistry instance")
        from presidio_analyzer import RecognizerRegistry
        store_api = RecognizerStoreApi() if use_recognizer_store else None
        registry = RecognizerRegistry(recognizer_store_api=store_api)

    tracer = app_tracer or AppTracer()

    self.nlp_engine = nlp_engine
    self.registry = registry

    # A registry with no recognizers is filled with the predefined ones.
    if not registry.recognizers:
        registry.load_predefined_recognizers()

    self.app_tracer = tracer
    self.enable_trace_pii = enable_trace_pii
    self.default_score_threshold = (
        0 if default_score_threshold is None else default_score_threshold
    )
def score_presidio_recognizer(
    recognizer, entities_to_keep, input_samples, withNlpArtifacts=False
) -> EvaluationResult:
    """
    Evaluate a single recognizer on the given samples and return the score.

    :param recognizer: recognizer handed to PresidioRecognizerEvaluator
    :param entities_to_keep: entities passed to the evaluator
    :param input_samples: samples to evaluate; a slice copy is evaluated,
    not the caller's sequence object itself
    :param withNlpArtifacts: forwarded as the evaluator's
    ``with_nlp_artifacts`` argument
    :return: the result of ``calculate_score`` (called with beta=2.5),
    with ``pii_precision`` set to 0 if it was NaN
    """
    evaluator = PresidioRecognizerEvaluator(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=withNlpArtifacts,
    )

    samples_copy = input_samples[:]
    evaluated = evaluator.evaluate_all(samples_copy)
    result = evaluator.calculate_score(evaluated, beta=2.5)

    # The result is printed before the NaN precision is replaced.
    result.print()
    if math.isnan(result.pii_precision):
        result.pii_precision = 0
    return result
def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Score one EntityRecognizer against a dataset.

    When ``input_samples`` is empty or None the samples are read from
    ``../../data/synth_dataset_v2.json``; otherwise a list copy of the
    given samples is used. Entity types are aligned through
    ``Evaluator.align_entity_types`` with
    ``PresidioAnalyzerWrapper.presidio_entities_map`` before the wrapped
    recognizer is handed to ``score_model``.

    :param recognizer: the recognizer to wrap and evaluate
    :param entities_to_keep: entities passed to the wrapper and to
    ``score_model``
    :param input_samples: optional samples to evaluate
    :param labeling_scheme: labeling scheme passed to the wrapper
    :param with_nlp_artifacts: forwarded to the wrapper
    :param verbose: forwarded to ``score_model``
    :return: whatever ``score_model`` returns
    """
    if input_samples:
        samples = list(input_samples)
    else:
        print("Reading dataset")
        samples = InputSample.read_dataset_json(
            "../../data/synth_dataset_v2.json"
        )

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )
    entity_map = PresidioAnalyzerWrapper.presidio_entities_map
    aligned_samples = Evaluator.align_entity_types(
        samples, entities_mapping=entity_map
    )

    wrapped_recognizer = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )

    return score_model(
        model=wrapped_recognizer,
        entities_to_keep=entities_to_keep,
        input_samples=aligned_samples,
        verbose=verbose,
    )
def serve_command_handler(enable_trace_pii,
                          env_grpc_port=False,
                          grpc_port=3000):
    """
    Start the analyzer gRPC server and block until interrupted.

    :param enable_trace_pii: forwarded to AnalyzerEngine as
    ``enable_trace_pii``
    :param env_grpc_port: when truthy, the listening port is read from
    the ``GRPC_PORT`` environment variable
    :param grpc_port: port used when ``env_grpc_port`` is falsy, or when
    ``GRPC_PORT`` is unset or empty
    :raises ValueError: if ``GRPC_PORT`` is set to a non-integer value
    """
    logger.info("Starting GRPC server")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    logger.info("GRPC started")

    logger.info("Creating RecognizerRegistry")
    registry = RecognizerRegistry()
    logger.info("RecognizerRegistry created")
    logger.info("Creating SpacyNlpEngine")
    nlp_engine = SpacyNlpEngine()
    logger.info("SpacyNlpEngine created")

    analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
        AnalyzerEngine(registry=registry,
                       nlp_engine=nlp_engine,
                       enable_trace_pii=enable_trace_pii,
                       use_recognizer_store=True),
        server)

    logger.info("Added AnalyzeServiceServicer to server")

    if env_grpc_port:
        logger.info("Getting port {}".format(env_grpc_port))
        port = os.environ.get('GRPC_PORT')
        # Both an unset variable (None) and an empty one ('') must fall
        # back to grpc_port; int() would raise on either of them.
        if port:
            grpc_port = int(port)
        else:
            logger.info("GRPC_PORT is not set. "
                        "Using grpc_port {}".format(grpc_port))
    else:
        logger.info("env_grpc_port not provided. "
                    "Using grpc_port {}".format(grpc_port))

    server.add_insecure_port('[::]:' + str(grpc_port))
    logger.info("Starting GRPC listener at port {}".format(grpc_port))
    server.start()
    # Keep the calling thread alive until Ctrl+C, then stop the server
    # with a grace period of 0.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop(0)
import os
import sys

from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Append "<parent of this file's directory>/tests" to the module search
# path before the assertion helpers are imported.
_this_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(_this_dir) + "/tests")

from .assertions import assert_result, assert_result_within_score_range

# Created once, when this module is imported.
print("Creating tests SpacyNlpEngine which starts the spaCy model")
TESTS_NLP_ENGINE = SpacyNlpEngine()