예제 #1
0
    def initialize(self):
        """
        Configure the recognizer/anonymizer classes and build both engines.

        Sets SpacyRecognizer.ENTITIES to ["PERSON"] and Replace.NEW_VALUE to
        'replace_text'. Both are assignments on the classes themselves, not
        on instances. A SpacyNlpEngine is then created and its 'en' entry is
        replaced with the 'en_core_web_lg' spaCy model loaded with the
        "parser", "tagger" and "lemmatizer" components disabled.

        Stores the resulting AnalyzerEngine on ``self.analyzer_engine`` and a
        new AnonymizerEngine on ``self.anonymizer_engine``.
        """
        SpacyRecognizer.ENTITIES = ["PERSON"]
        Replace.NEW_VALUE = 'replace_text'

        spacy_engine = SpacyNlpEngine()
        # Override the engine's English pipeline with a trimmed-down model.
        english_model = spacy.load(
            'en_core_web_lg',
            disable=["parser", "tagger", "lemmatizer"],
        )
        spacy_engine.nlp['en'] = english_model

        self.analyzer_engine = AnalyzerEngine(nlp_engine=spacy_engine)
        self.anonymizer_engine = AnonymizerEngine()
예제 #2
0
    def __init__(self,
                 registry=None,
                 nlp_engine=None,
                 app_tracer=None,
                 enable_trace_pii=False,
                 default_score_threshold=None,
                 use_recognizer_store=False):
        """
        Create an AnalyzerEngine, which orchestrates the detection of PII
        entities and all related logic.

        :param registry: instance of type RecognizerRegistry.
        When falsy, a default RecognizerRegistry is created.
        :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine).
        When falsy, a new SpacyNlpEngine is created.
        :param app_tracer: instance of type AppTracer,
        used to trace the logic used during each request.
        When falsy, a new AppTracer is created.
        :param enable_trace_pii: bool,
        defines whether PII values should be traced or not.
        :param default_score_threshold: Minimum confidence value
        for detected entities to be returned. None is stored as 0.
        :param use_recognizer_store: Whether to call the
        Presidio Recognizer Store on every request to gather
        responses from custom recognizers as well
        (only applicable for the full Presidio service).
        Only consulted when the default registry is created here.
        """
        # Fall back to default collaborators for anything not supplied.
        if not nlp_engine:
            logger.info("nlp_engine not provided. Creating new "
                        "SpacyNlpEngine instance")
            from presidio_analyzer.nlp_engine import SpacyNlpEngine
            nlp_engine = SpacyNlpEngine()

        if not registry:
            logger.info("Recognizer registry not provided. "
                        "Creating default RecognizerRegistry instance")
            from presidio_analyzer import RecognizerRegistry
            store_api = RecognizerStoreApi() if use_recognizer_store else None
            registry = RecognizerRegistry(recognizer_store_api=store_api)

        app_tracer = app_tracer or AppTracer()

        self.nlp_engine = nlp_engine
        self.registry = registry

        # An empty registry gets the predefined recognizers loaded into it.
        if not registry.recognizers:
            registry.load_predefined_recognizers()

        self.app_tracer = app_tracer
        self.enable_trace_pii = enable_trace_pii
        self.default_score_threshold = (
            0 if default_score_threshold is None else default_score_threshold
        )
def score_presidio_recognizer(
    recognizer, entities_to_keep, input_samples, withNlpArtifacts=False
) -> EvaluationResult:
    """
    Evaluate a single recognizer on the given samples and return its score.

    A PresidioRecognizerEvaluator is built around ``recognizer`` with a new
    SpacyNlpEngine, run over a shallow copy of ``input_samples``, and scored
    with ``beta=2.5``. The result is printed before being returned.

    :param recognizer: recognizer passed to PresidioRecognizerEvaluator
    :param entities_to_keep: entities passed to PresidioRecognizerEvaluator
    :param input_samples: sliceable collection of samples to evaluate
    :param withNlpArtifacts: forwarded as ``with_nlp_artifacts``
    :return: the evaluation result; a NaN ``pii_precision`` is replaced by 0
    """
    evaluator = PresidioRecognizerEvaluator(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=withNlpArtifacts,
    )

    # Evaluate on a copy so the evaluator never sees the caller's own list.
    samples_copy = input_samples[:]
    scores = evaluator.calculate_score(
        evaluator.evaluate_all(samples_copy), beta=2.5
    )
    scores.print()

    # Normalize an undefined precision to 0 for callers.
    if math.isnan(scores.pii_precision):
        scores.pii_precision = 0
    return scores
예제 #4
0
def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Run data through one EntityRecognizer and gather results and stats.

    :param recognizer: the recognizer to wrap and score
    :param entities_to_keep: entity names passed to both the wrapper and
    ``score_model``
    :param input_samples: samples to score; when falsy, the dataset at
    "../../data/synth_dataset_v2.json" is read instead
    :param labeling_scheme: forwarded to PresidioRecognizerWrapper
    :param with_nlp_artifacts: forwarded to PresidioRecognizerWrapper
    :param verbose: forwarded to ``score_model``
    :return: whatever ``score_model`` returns for the wrapped recognizer
    """
    if input_samples:
        # Work on our own list rather than the caller's object.
        samples = list(input_samples)
    else:
        print("Reading dataset")
        samples = InputSample.read_dataset_json(
            "../../data/synth_dataset_v2.json")

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )
    aligned_samples = Evaluator.align_entity_types(
        samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map)

    wrapped_recognizer = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )

    return score_model(
        model=wrapped_recognizer,
        entities_to_keep=entities_to_keep,
        input_samples=aligned_samples,
        verbose=verbose,
    )
예제 #5
0
def serve_command_handler(enable_trace_pii,
                          env_grpc_port=False,
                          grpc_port=3000):
    """
    Start the analyzer gRPC server and block until interrupted.

    Creates a gRPC server backed by a 10-worker thread pool, registers an
    AnalyzerEngine (with ``use_recognizer_store=True``) as the analyze
    service, binds an insecure port on ``[::]`` and then sleeps in a loop
    until a KeyboardInterrupt stops the server.

    :param enable_trace_pii: forwarded to AnalyzerEngine
    :param env_grpc_port: when truthy, the port is read from the
    ``GRPC_PORT`` environment variable; if that variable is unset or
    empty, ``grpc_port`` is used instead
    :param grpc_port: port to listen on when no environment override applies
    :raises ValueError: if ``GRPC_PORT`` is set to a non-integer value
    """
    logger.info("Starting GRPC server")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    logger.info("GRPC started")

    logger.info("Creating RecognizerRegistry")
    registry = RecognizerRegistry()
    logger.info("RecognizerRegistry created")
    logger.info("Creating SpacyNlpEngine")
    nlp_engine = SpacyNlpEngine()
    logger.info("SpacyNlpEngine created")

    analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
        AnalyzerEngine(registry=registry,
                       nlp_engine=nlp_engine,
                       enable_trace_pii=enable_trace_pii,
                       use_recognizer_store=True), server)

    logger.info("Added AnalyzeServiceServicer to server")

    if env_grpc_port:
        logger.info("Getting port {}".format(env_grpc_port))
        port = os.environ.get('GRPC_PORT')
        # The previous check (`port is not None or port != ''`) was always
        # true, so an unset or empty GRPC_PORT reached int() and raised.
        # Only override the default when the variable has a value.
        if port:
            grpc_port = int(port)
        else:
            logger.info("GRPC_PORT is not set or empty. "
                        "Using grpc_port {}".format(grpc_port))
    else:
        logger.info("env_grpc_port not provided. "
                    "Using grpc_port {}".format(grpc_port))

    server.add_insecure_port('[::]:' + str(grpc_port))
    logger.info("Starting GRPC listener at port {}".format(grpc_port))
    server.start()
    # server.start() does not block, so keep the process alive until Ctrl+C.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop(0)
예제 #6
0
import os
import sys

from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Append "<parent of this file's directory>/tests" to the module search path.
sys.path.append(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))) + "/tests")

# NOTE(review): this is a package-relative import, so it is unclear whether
# it depends on the sys.path entry added above — confirm before reordering.
from .assertions import assert_result, assert_result_within_score_range

# Module-level engine, created once when this module is imported.
# Presumably shared by the tests so the spaCy model is only started once —
# TODO confirm against the test modules that import TESTS_NLP_ENGINE.
print("Creating tests SpacyNlpEngine which starts the spaCy model")
TESTS_NLP_ENGINE = SpacyNlpEngine()