class NERTests(TestCase):
    """Check the PII types that NERScanner.scan returns for short sentences."""

    def setUp(self):
        """Build the NERScanner used by the test methods."""
        self.parser = NERScanner()

    def test_person(self):
        """A sentence naming a person should yield PiiTypes.PERSON."""
        types = self.parser.scan("Jonathan is in the office")
        # assertIn shows both the member and the container when it fails,
        # whereas assertTrue(x in y) only reports "False is not true".
        self.assertIn(PiiTypes.PERSON, types)

    def test_location(self):
        """A sentence naming a city should yield PiiTypes.LOCATION."""
        types = self.parser.scan("Jonathan is in Bangalore")
        self.assertIn(PiiTypes.LOCATION, types)
class NERTests(TestCase):
    """Check the PII types that NERScanner.scan returns for short sentences."""

    def setUp(self):
        """Build the NERScanner used by the test methods."""
        self.parser = NERScanner()

    def test_person(self):
        """A sentence naming a person should yield PiiTypes.PERSON."""
        types = self.parser.scan("Roger is in the office")
        # assertIn shows both the member and the container when it fails,
        # whereas assertTrue(x in y) only reports "False is not true".
        self.assertIn(PiiTypes.PERSON, types)

    def test_location(self):
        """A sentence naming a city should yield PiiTypes.LOCATION."""
        types = self.parser.scan("Jonathan is in Bangalore")
        self.assertIn(PiiTypes.LOCATION, types)

    def test_date(self):
        """A sentence containing a calendar date should yield PiiTypes.BIRTH_DATE."""
        types = self.parser.scan("Jan 1 2016 is a new year")
        self.assertIn(PiiTypes.BIRTH_DATE, types)
def scan(self):
    """Record the file(s) found at ``self._path`` and scan each of them.

    When ``self._path`` is a regular file, only that file is recorded.
    Otherwise the path is traversed with ``os.walk`` and every file found is
    recorded. Each record is a ``File`` built from the path and the MIME
    type reported by ``magic.from_file``. Finally every entry in
    ``self._files`` is scanned with one shared context containing a
    tokenizer, a regex scanner and a NER scanner.
    """
    # Logger arguments are passed lazily so the message is only formatted
    # when debug logging is actually enabled.
    logging.debug("Scanning %s", self._path)

    if os.path.isfile(self._path):
        mime_type = magic.from_file(self._path, mime=True)
        self._files.append(File(self._path, mime_type))
        logging.debug(
            "\t- full path: %s, mime_type: %s",
            os.path.abspath(self._path),
            mime_type,
        )
    else:
        # The directory names yielded by os.walk are not needed here.
        for root, _subdirs, files in os.walk(self._path):
            for filename in files:
                file_path = os.path.join(root, filename)
                mime_type = magic.from_file(file_path, mime=True)
                logging.debug(
                    "\t- full path: %s, mime_type: %s", file_path, mime_type
                )
                self._files.append(File(file_path, mime_type))

    # One set of scanners is built and shared across all files.
    context = {
        "tokenizer": Tokenizer(),
        "regex": RegexScanner(),
        "ner": NERScanner(),
    }
    for f in self._files:
        f.scan(context)
def test_positive_scan_column(self):
    """Scanning a person's name marks the column as PII of type PERSON."""
    column = Column('col')
    scanners = [RegexScanner(), NERScanner()]
    column.scan('Jonathan Smith', scanners)

    self.assertTrue(column.has_pii())

    expected = {'pii_types': [PiiTypes.PERSON], 'name': 'col'}
    self.assertEqual(expected, column.get_dict())
def test_positive_scan_column(self):
    """Scanning a person's name marks the column as PII of type PERSON."""
    column = Column("col")
    scanners = [RegexScanner(), NERScanner()]
    column.scan("Jonathan Smith", scanners)

    self.assertTrue(column.has_pii())

    expected = {"pii_types": [PiiTypes.PERSON], "name": "col"}
    self.assertEqual(expected, column.get_dict())
def scan(self, generator):
    """Scan every row produced by ``generator`` and collect the PII types found.

    Args:
        generator: Callable invoked with the keyword arguments
            ``column_list``, ``schema_name`` and ``table_name``. It must
            return an iterable of rows; each row's values are paired with
            ``self._columns`` by position using ``zip``.
    """
    scanners = [RegexScanner(), NERScanner()]

    # Note: the table object itself is passed as ``table_name``.
    rows = generator(
        column_list=self._columns, schema_name=self._schema, table_name=self
    )
    for row in rows:
        for col, val in zip(self._columns, row):
            col.scan(val, scanners)

    # A plain loop replaces the former side-effect-only list comprehension,
    # which built and discarded a list of None values.
    for col in self._columns:
        for pii_type in col.get_pii_types():
            self._pii.add(pii_type)

    logging.debug(self._pii)
def scan(self, generator):
    """Scan every row produced by ``generator`` and collect the PII types found.

    Args:
        generator: Callable invoked with the keyword arguments
            ``column_list``, ``schema_name`` and ``table_name``. It must
            return an iterable of rows; each row's values are paired with
            ``self.get_children()`` by position using ``zip``.
    """
    # Logger arguments are passed lazily so the message is only formatted
    # when debug logging is actually enabled.
    self.logger.debug("Scanning table name %s", self.get_name())

    scanners = [RegexScanner(), NERScanner()]

    # Note: the table object itself is passed as ``table_name``.
    rows = generator(
        column_list=self.get_children(),
        schema_name=self._schema,
        table_name=self,
    )
    for row in rows:
        for col, val in zip(self.get_children(), row):
            col.scan(val, scanners)

    # A plain loop replaces the former side-effect-only list comprehension,
    # which built and discarded a list of None values.
    for col in self.get_children():
        for pii_type in col.get_pii_types():
            self._pii.add(pii_type)

    self.logger.debug("%s has %s", self.get_name(), self.get_pii_types_str())
def scan_file_object(fd: TextIO) -> List[Any]:
    """Scan an open text file object and report the PII types it contains.

    Args:
        fd (file descriptor): A file descriptor open in text mode.

    Returns:
        A list of PIITypes enum of all the PII types found in the file.
    """
    io_scanner = IO("api file object", fd)

    # The scanners are handed to the IO wrapper through a context mapping.
    scan_context = {"tokenizer": Tokenizer(), "regex": RegexScanner(), "ner": NERScanner()}
    io_scanner.scan(scan_context)

    pii_types = io_scanner.get_pii_types()
    return pii_types
def test_null_scan_column(self):
    """Scanning a None value leaves the column without any PII types."""
    column = Column('col')
    scanners = [RegexScanner(), NERScanner()]
    column.scan(None, scanners)

    self.assertFalse(column.has_pii())

    expected = {'pii_types': [], 'name': 'col'}
    self.assertEqual(expected, column.get_dict())
def setUp(self):
    """Build a NERScanner and store it on ``self.parser``."""
    ner_scanner = NERScanner()
    self.parser = ner_scanner
def test_null_scan_column(self):
    """Scanning a None value leaves the column without any PII types."""
    column = Column("col")
    scanners = [RegexScanner(), NERScanner()]
    column.scan(None, scanners)

    self.assertFalse(column.has_pii())

    expected = {"pii_types": [], "name": "col"}
    self.assertEqual(expected, column.get_dict())
def scan(self, context):
    """Run the regex and NER scanners over ``context`` and record what they find.

    Args:
        context: Value passed unchanged to each scanner's ``scan`` method.
            Every item a scanner returns is added to ``self._pii``.
    """
    for scanner in [RegexScanner(), NERScanner()]:
        # A plain loop replaces the former side-effect-only list
        # comprehension, which built and discarded a list of None values.
        for pii in scanner.scan(context):
            self._pii.add(pii)

    logging.debug(self._pii)