from presidio_analyzer import PresidioLogger


class AppTracer:
    """This class provides the ability to log/trace the system's decisions,
    such as which modules were used for detection, which logic was utilized,
    what results were given and potentially why. This can be useful for
    analyzing the detection accuracy of the system."""

    def __init__(self, enabled=True):
        self.logger = PresidioLogger('Interpretability')
        self.logger.set_level("INFO")
        self.enabled = enabled

    def trace(self, request_id, trace_data):
        """
        Writes a value associated with a decision for a specific request into
        the trace, for further inspection if needed.
        :param request_id: A unique ID, to correlate across calls.
        :param trace_data: A string to write.
        :return:
        """
        if self.enabled:
            self.logger.info("[%s][%s]", request_id, trace_data)
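A minimal usage sketch for AppTracer (the request id and trace messages below are illustrative, not taken from the codebase): one tracer is typically created per process, and the same request id is passed on every call so that trace entries belonging to a single request can be correlated.

# Illustrative only: request id and messages are made up for this sketch.
tracer = AppTracer(enabled=True)
request_id = "req-1234"
tracer.trace(request_id, "nlp artifacts extracted")
tracer.trace(request_id, "recognizers considered: CreditCardRecognizer, SpacyRecognizer")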
import spacy

from presidio_analyzer import PresidioLogger
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine

logger = PresidioLogger()


class SpacyNlpEngine(NlpEngine):
    """ SpacyNlpEngine is an abstraction layer over the nlp module.
    It provides processing functionality as well as other queries
    on tokens.
    The SpacyNlpEngine uses SpaCy as its NLP module.
    """

    def __init__(self):
        logger.info("Loading NLP model: spaCy en_core_web_lg")
        self.nlp = {
            "en": spacy.load("en_core_web_lg", disable=['parser', 'tagger'])
        }
        logger.info("Printing spaCy model and package details:"
                    "\n\n {}\n\n".format(spacy.info("en_core_web_lg")))

    def process_text(self, text, language):
        """ Execute the SpaCy NLP pipeline on the given text and language
        """
        doc = self.nlp[language](text)
        return self.doc_to_nlp_artifact(doc, language)
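A minimal sketch of driving the engine directly, assuming the en_core_web_lg model is installed and that doc_to_nlp_artifact (inherited from NlpEngine) returns an NlpArtifacts exposing the tokens and lemmas fields used later in this section; the sample text is made up.

# Sketch, not part of the module.
engine = SpacyNlpEngine()  # loads en_core_web_lg once at construction
artifacts = engine.process_text("My name is David and my number is 555-1234", "en")
print(artifacts.tokens)   # spaCy tokens of the analyzed text
print(artifacts.lemmas)   # lemmatized tokens, used for context enhancement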
import os

import grpc

from presidio_analyzer.protobuf_models import (
    recognizers_store_pb2,
    recognizers_store_pb2_grpc,
)
from presidio_analyzer import PatternRecognizer, Pattern, PresidioLogger

logger = PresidioLogger("presidio")


class RecognizerStoreApi:
    """ The RecognizerStoreApi is the object that talks to the remote
    recognizers store service and gets the recognizers / hash
    """

    def __init__(self):
        try:
            recognizers_store_svc_url = \
                os.environ["RECOGNIZERS_STORE_SVC_ADDRESS"]
        except KeyError:
            recognizers_store_svc_url = "localhost:3004"

        channel = grpc.insecure_channel(recognizers_store_svc_url)
        self.rs_stub = recognizers_store_pb2_grpc.RecognizersStoreServiceStub(
            channel)

    def get_latest_hash(self):
        """
        Returns the hash of all the stored custom recognizers. Returns empty
import logging
import os
import sys

from knack.help import CLIHelp
from knack.help_files import helps

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from presidio_analyzer.protobuf_models import analyze_pb2, analyze_pb2_grpc  # noqa
from presidio_analyzer import AnalyzerEngine, PresidioLogger, RecognizerRegistry  # noqa
from presidio_analyzer.nlp_engine import SpacyNlpEngine  # noqa

# Map the LOG_LEVEL environment variable (default INFO) to a logging level
log_level_name = os.environ.get('LOG_LEVEL', 'INFO')
log_level = logging.INFO
if log_level_name == 'WARNING':
    log_level = logging.WARNING
if log_level_name == 'ERROR':
    log_level = logging.ERROR

logger = PresidioLogger("presidio")
logger.set_level(log_level)

WELCOME_MESSAGE = r"""
 _______  _______  _______  _______ _________ ______  _________ _______
(  ____ )(  ____ )(  ____ \(  ____ \\__   __/(  __  \ \__   __/(  ___  )
| (    )|| (    )|| (    \/| (    \/   ) (   | (  \  )   ) (   | (   ) |
| (____)|| (____)|| (__    | (_____    | |   | |   ) |   | |   | |   | |
|  _____)|     __)|  __)   (_____  )   | |   | |   | |   | |   | |   | |
| (      | (\ (   | (            ) |   | |   | |   ) |   | |   | |   | |
| )      | ) \ \__| (____/\/\____) |___) (___| (__/  )___) (___| (___) |
|/       |/   \__/(_______/\_______)\_______/(______/ \_______/(_______)
"""
import copy
from abc import abstractmethod

from presidio_analyzer import PresidioLogger


class EntityRecognizer:
    MIN_SCORE = 0
    MAX_SCORE = 1.0
    CONTEXT_SIMILARITY_THRESHOLD = 0.65
    CONTEXT_SIMILARITY_FACTOR = 0.35
    MIN_SCORE_WITH_CONTEXT_SIMILARITY = 0.4
    CONTEXT_PREFIX_COUNT = 5
    CONTEXT_SUFFIX_COUNT = 0

    def __init__(self, supported_entities, name=None, supported_language="en",
                 version="0.0.1"):
        """
        An abstract class to be inherited by Recognizers which hold the logic
        for recognizing specific PII entities.
        :param supported_entities: the entities supported by this recognizer
               (for example, phone number, address, etc.)
        :param supported_language: the language supported by this recognizer.
               The supported language code is in ISO 639-1 format
        :param name: the name of this recognizer (optional)
        :param version: the recognizer's current version
        """
        self.supported_entities = supported_entities

        if name is None:
            self.name = self.__class__.__name__  # assign class name as name
        else:
            self.name = name

        self.supported_language = supported_language
        self.version = version
        self.is_loaded = False

        self.logger = PresidioLogger()
        self.load()
        self.logger.info("Loaded recognizer: %s", self.name)
        self.is_loaded = True

    @abstractmethod
    def load(self):
        """
        Initialize the recognizer assets if needed
        (e.g. machine learning models)
        """

    @abstractmethod
    def analyze(self, text, entities, nlp_artifacts):
        """
        This is the core method for analyzing text, assuming entities is a
        subset of the supported entity types.
        :param text: The text to be analyzed
        :param entities: The list of entities to be detected
        :param nlp_artifacts: Value of type NlpArtifacts.
               A group of attributes which are the result of
               some NLP process over the matching text
        :return: list of RecognizerResult
        :rtype: [RecognizerResult]
        """
        return None

    def get_supported_entities(self):
        """
        :return: A list of the entities supported by this recognizer
        """
        return self.supported_entities

    def get_supported_language(self):
        """
        :return: The language supported by this recognizer
        """
        return self.supported_language

    def get_version(self):
        """
        :return: The current version of this recognizer
        """
        return self.version

    def to_dict(self):
        return_dict = {
            "supported_entities": self.supported_entities,
            "supported_language": self.supported_language,
            "name": self.name,
            "version": self.version
        }
        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict):
        return cls(**entity_recognizer_dict)

    def enhance_using_context(self, text, raw_results, nlp_artifacts,
                              recognizer_context_words):
        """ Using the words surrounding the actual word matches, look for
        specific strings that, if found, contribute to the score of the
        result, improving the confidence that the match is indeed of that
        PII entity type
        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
                            context into consideration
        :param nlp_artifacts: The nlp artifacts contain elements such as
                              lemmatized tokens for better accuracy of the
                              context enhancement process
        :param recognizer_context_words: The words the current recognizer
                                         supports (words to look up)
        """
        # create a deep copy of the results object so we can manipulate it
        results = copy.deepcopy(raw_results)

        # Sanity
        if nlp_artifacts is None:
            self.logger.warning('[%s]. NLP artifacts were not provided',
                                self.name)
            return results

        if recognizer_context_words is None or recognizer_context_words == []:
            self.logger.info("recognizer '%s' does not support context "
                             "enhancement", self.name)
            return results

        for result in results:
            # extract lemmatized context from the surroundings of the match
            word = text[result.start:result.end]

            surrounding_words = self.__extract_surrounding_words(
                nlp_artifacts=nlp_artifacts, word=word, start=result.start)

            supportive_context_word = self.__find_supportive_word_in_context(
                surrounding_words, recognizer_context_words)
            if supportive_context_word != "":
                result.score += self.CONTEXT_SIMILARITY_FACTOR
                result.score = max(result.score,
                                   self.MIN_SCORE_WITH_CONTEXT_SIMILARITY)
                result.score = min(result.score, EntityRecognizer.MAX_SCORE)

                # Update the explainability object with the context
                # information which helped improve the score
                result.analysis_explanation.set_supportive_context_word(
                    supportive_context_word)
                result.analysis_explanation.set_improved_score(result.score)
        return results

    @staticmethod
    def __context_to_keywords(context):
        return context.split(' ')

    def __find_supportive_word_in_context(self, context_list,
                                          recognizer_context_list):
        """A word is considered a supportive context word if there's an
        exact match between a keyword in context_list and any keyword in
        recognizer_context_list
        :param context_list: words before and after the matched entity within
                             a specified window size
        :param recognizer_context_list: a list of words considered as context
                                        keywords, manually specified by the
                                        recognizer's author
        """
        word = ""
        # If the context list is empty, no need to continue
        if context_list is None or recognizer_context_list is None:
            return word

        for predefined_context_word in recognizer_context_list:
            # result == true only if any of the predefined context words
            # is found exactly or as a substring in any of the collected
            # context words
            result = next((True for keyword in context_list
                           if predefined_context_word in keyword), False)
            if result:
                self.logger.debug("Found context keyword '%s'",
                                  predefined_context_word)
                word = predefined_context_word
                break

        return word

    @staticmethod
    def __add_n_words(index, n_words, lemmas, lemmatized_filtered_keywords,
                      is_backward):
        """ Prepare a list of context words which surround the lemma at the
        given index. The words are collected only if they exist in the
        filtered keywords array
        :param index: index of the lemma whose surrounding words we want
        :param n_words: number of words to take
        :param lemmas: array of lemmas
        :param lemmatized_filtered_keywords: the array of filtered
               lemmas from the original sentence
        :param is_backward: if true, take the preceding words;
               if false, take the following words
        """
        i = index
        context_words = []
        # The entity itself is of no interest to us...however we want to
        # consider it anyway for cases where it is attached with no spaces
        # to an interesting context word, so we allow it and add 1 to
        # the number of collected words

        # collect at most n words (in lower case)
        remaining = n_words + 1
        while 0 <= i < len(lemmas) and remaining > 0:
            lower_lemma = lemmas[i].lower()
            if lower_lemma in lemmatized_filtered_keywords:
                context_words.append(lower_lemma)
                remaining -= 1
            i = i - 1 if is_backward else i + 1
        return context_words

    def __add_n_words_forward(self, index, n_words, lemmas,
                              lemmatized_filtered_keywords):
        return self.__add_n_words(index, n_words, lemmas,
                                  lemmatized_filtered_keywords, False)

    def __add_n_words_backward(self, index, n_words, lemmas,
                               lemmatized_filtered_keywords):
        return self.__add_n_words(index, n_words, lemmas,
                                  lemmatized_filtered_keywords, True)

    @staticmethod
    def find_index_of_match_token(word, start, tokens, tokens_indices):
        found = False
        # We use the known start index of the original word to find the
        # actual token at that index. We are not checking for equivalence,
        # since the token might be just a substring of that word (e.g. for
        # phone number 555-124564 the first token might be just '555', or for
        # a match like ' rocket' the actual token will just be 'rocket',
        # hence the misalignment of indices).
        # Note: we are iterating over the original tokens (not the lemmatized)
        i = -1
        for i, token in enumerate(tokens, 0):
            # Either we found a token with the exact location, or
            # we take a token whose character indices cover
            # the index we are looking for.
            if ((tokens_indices[i] == start) or
                    (start < tokens_indices[i] + len(token))):
                # found the interesting token, the one around which
                # we take n words; we save the matching lemma
                found = True
                break

        if not found:
            raise ValueError("Did not find word '" + word + "' "
                             "in the list of tokens although it "
                             "is expected to be found")
        return i

    def __extract_surrounding_words(self, nlp_artifacts, word, start):
        """ Extracts the words surrounding another given word.
        The text from which the context is extracted is given in the nlp doc
        :param nlp_artifacts: An abstraction layer which holds different
                              items which are the result of an NLP pipeline
                              execution on a given text
        :param word: The word to look for context around
        :param start: The start index of the word in the original text
        """
        if not nlp_artifacts.tokens:
            self.logger.info('Skipping context extraction due to '
                             'lack of NLP artifacts')
            # if there are no nlp artifacts, this is ok: we cannot
            # extract context, so we return a valid, yet empty,
            # context
            return ''

        # Get the already prepared words in the given text, in their
        # LEMMATIZED version
        lemmatized_keywords = nlp_artifacts.keywords

        # since the list of tokens is not necessarily aligned
        # with the actual index of the match, we look for the
        # token index which corresponds to the match
        token_index = EntityRecognizer.find_index_of_match_token(
            word, start, nlp_artifacts.tokens, nlp_artifacts.tokens_indices)

        # index i belongs to the PII entity; take the preceding n words
        # and the following m words into a context list
        backward_context = \
            self.__add_n_words_backward(token_index,
                                        EntityRecognizer.CONTEXT_PREFIX_COUNT,
                                        nlp_artifacts.lemmas,
                                        lemmatized_keywords)
        forward_context = \
            self.__add_n_words_forward(token_index,
                                       EntityRecognizer.CONTEXT_SUFFIX_COUNT,
                                       nlp_artifacts.lemmas,
                                       lemmatized_keywords)

        context_list = []
        context_list.extend(backward_context)
        context_list.extend(forward_context)
        context_list = list(set(context_list))
        self.logger.debug('Context list is: %s', " ".join(context_list))
        return context_list
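To illustrate how the abstract class above is meant to be subclassed, here is a minimal sketch; the TITLE entity, the deny list, and the 0.5 score are hypothetical, and RecognizerResult is assumed to take (entity_type, start, end, score). A real rule-based recognizer would normally extend PatternRecognizer instead.

from presidio_analyzer import RecognizerResult  # assumed import path


class TitleRecognizer(EntityRecognizer):
    """Toy recognizer that flags a fixed list of honorifics (illustrative only)."""

    TITLES = ["mr.", "mrs.", "dr."]

    def __init__(self):
        super().__init__(supported_entities=["TITLE"])

    def load(self):
        # Nothing to load for this toy example (no models or external assets)
        pass

    def analyze(self, text, entities, nlp_artifacts):
        results = []
        lower = text.lower()
        for title in self.TITLES:
            start = lower.find(title)
            if start >= 0:
                # Hypothetical base score; context enhancement may raise it later
                results.append(
                    RecognizerResult("TITLE", start, start + len(title), 0.5))
        return results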