示例#1
0
class NERTests(TestCase):
    """Check that ``NERScanner.scan`` reports the expected PII types."""

    def setUp(self):
        """Create the scanner under test before each test method runs."""
        self.parser = NERScanner()

    def test_person(self):
        """A sentence with a person's name must yield ``PiiTypes.PERSON``."""
        types = self.parser.scan("Jonathan is in the office")
        # assertIn reports the missing member and the container on failure,
        # whereas assertTrue(x in y) only says "False is not true".
        self.assertIn(PiiTypes.PERSON, types)

    def test_location(self):
        """A sentence with a place name must yield ``PiiTypes.LOCATION``."""
        types = self.parser.scan("Jonathan is in Bangalore")
        self.assertIn(PiiTypes.LOCATION, types)
示例#2
0
class NERTests(TestCase):
    """Check that ``NERScanner.scan`` reports the expected PII types."""

    def setUp(self):
        """Create the scanner under test before each test method runs."""
        self.parser = NERScanner()

    def test_person(self):
        """A sentence with a person's name must yield ``PiiTypes.PERSON``."""
        types = self.parser.scan("Roger is in the office")
        # assertIn reports the missing member and the container on failure,
        # whereas assertTrue(x in y) only says "False is not true".
        self.assertIn(PiiTypes.PERSON, types)

    def test_location(self):
        """A sentence with a place name must yield ``PiiTypes.LOCATION``."""
        types = self.parser.scan("Jonathan is in Bangalore")
        self.assertIn(PiiTypes.LOCATION, types)

    def test_date(self):
        """A sentence with a calendar date must yield ``PiiTypes.BIRTH_DATE``."""
        types = self.parser.scan("Jan 1 2016 is a new year")
        self.assertIn(PiiTypes.BIRTH_DATE, types)
示例#3
0
    def scan(self):
        """Collect the file(s) under ``self._path`` and scan each of them.

        When ``self._path`` is a regular file it is recorded on its own;
        otherwise the path is walked with ``os.walk`` and every file found is
        recorded. Each recorded file is paired with the MIME type reported by
        ``magic.from_file`` and appended to ``self._files``. Every entry of
        ``self._files`` is then scanned with one shared context holding a
        tokenizer, a regex scanner and an NER scanner.
        """
        # Pass values as logging arguments so the message is only formatted
        # when DEBUG output is actually enabled.
        logging.debug("Scanning %s", self._path)
        if os.path.isfile(self._path):
            mime_type = magic.from_file(self._path, mime=True)
            self._files.append(File(self._path, mime_type))
            logging.debug(
                "\t- full path: %s, mime_type: %s",
                os.path.abspath(self._path),
                mime_type,
            )
        else:
            # The directory list yielded by os.walk is not needed here.
            for root, _subdirs, files in os.walk(self._path):
                for filename in files:
                    file_path = os.path.join(root, filename)
                    mime_type = magic.from_file(file_path, mime=True)

                    logging.debug(
                        "\t- full path: %s, mime_type: %s", file_path, mime_type
                    )
                    self._files.append(File(file_path, mime_type))

        # The scanners are built once and shared by every file.
        context = {
            "tokenizer": Tokenizer(),
            "regex": RegexScanner(),
            "ner": NERScanner(),
        }
        for f in self._files:
            f.scan(context)
示例#4
0
 def test_positive_scan_column(self):
     """Scanning a person's name flags the column with ``PiiTypes.PERSON``."""
     column = Column('col')
     scanners = [RegexScanner(), NERScanner()]
     column.scan('Jonathan Smith', scanners)
     self.assertTrue(column.has_pii())
     expected = {'pii_types': [PiiTypes.PERSON], 'name': 'col'}
     self.assertEqual(expected, column.get_dict())
 def test_positive_scan_column(self):
     """Scanning a person's name flags the column with ``PiiTypes.PERSON``."""
     column = Column("col")
     scanners = [RegexScanner(), NERScanner()]
     column.scan("Jonathan Smith", scanners)
     self.assertTrue(column.has_pii())
     expected = {"pii_types": [PiiTypes.PERSON], "name": "col"}
     self.assertEqual(expected, column.get_dict())
示例#6
0
    def scan(self, generator):
        """Scan every row produced by ``generator`` and collect PII types.

        Args:
            generator: Callable invoked with ``column_list``, ``schema_name``
                and ``table_name`` keyword arguments; it must return an
                iterable of rows. The values of each row are paired
                positionally with ``self._columns``.
        """
        scanners = [RegexScanner(), NERScanner()]
        # NOTE(review): ``table_name`` receives this object itself rather than
        # a string -- presumably what the generator expects; confirm with caller.
        for row in generator(column_list=self._columns,
                             schema_name=self._schema,
                             table_name=self):
            for col, val in zip(self._columns, row):
                col.scan(val, scanners)

        # Plain loops replace a list comprehension that was used only for its
        # side effect and built a throwaway list.
        for col in self._columns:
            for pii_type in col.get_pii_types():
                self._pii.add(pii_type)

        logging.debug(self._pii)
    def scan(self, generator):
        """Scan every row produced by ``generator`` and collect PII types.

        Args:
            generator: Callable invoked with ``column_list``, ``schema_name``
                and ``table_name`` keyword arguments; it must return an
                iterable of rows. The values of each row are paired
                positionally with ``self.get_children()``.
        """
        # Lazy logging arguments, consistent with the debug call at the end
        # of this method.
        self.logger.debug("Scanning table name %s", self.get_name())
        scanners = [RegexScanner(), NERScanner()]
        # NOTE(review): ``table_name`` receives this object itself rather than
        # a string -- presumably what the generator expects; confirm with caller.
        for row in generator(column_list=self.get_children(),
                             schema_name=self._schema,
                             table_name=self):
            for col, val in zip(self.get_children(), row):
                col.scan(val, scanners)

        # Plain loops replace a list comprehension that was used only for its
        # side effect and built a throwaway list.
        for col in self.get_children():
            for pii_type in col.get_pii_types():
                self._pii.add(pii_type)

        self.logger.debug("%s has %s", self.get_name(),
                          self.get_pii_types_str())
示例#8
0
def scan_file_object(fd: TextIO) -> List[Any]:
    """Scan an open text file object and report the PII types found in it.

    Args:
        fd: A file object opened in text mode.

    Returns:
        The result of ``get_pii_types()`` on the ``IO`` scanner once the scan
        has run, i.e. the PII types detected in the file.
    """
    io_scanner = IO("api file object", fd)

    # Build the scanning helpers first, then bundle them under the keys the
    # scanner looks up.
    tokenizer = Tokenizer()
    regex_scanner = RegexScanner()
    ner_scanner = NERScanner()
    scan_context = {
        "tokenizer": tokenizer,
        "regex": regex_scanner,
        "ner": ner_scanner,
    }

    io_scanner.scan(scan_context)
    return io_scanner.get_pii_types()
示例#9
0
 def test_null_scan_column(self):
     """Scanning a ``None`` value leaves the column without any PII types."""
     column = Column('col')
     scanners = [RegexScanner(), NERScanner()]
     column.scan(None, scanners)
     self.assertFalse(column.has_pii())
     expected = {'pii_types': [], 'name': 'col'}
     self.assertEqual(expected, column.get_dict())
示例#10
0
 def setUp(self):
     """Create the NER scanner and store it on ``self.parser``."""
     ner_scanner = NERScanner()
     self.parser = ner_scanner
 def test_null_scan_column(self):
     """Scanning a ``None`` value leaves the column without any PII types."""
     column = Column("col")
     scanners = [RegexScanner(), NERScanner()]
     column.scan(None, scanners)
     self.assertFalse(column.has_pii())
     expected = {"pii_types": [], "name": "col"}
     self.assertEqual(expected, column.get_dict())
示例#12
0
    def scan(self, context):
        """Run the regex and NER scanners on ``context`` and record PII types.

        Args:
            context: Passed unchanged to each scanner's ``scan`` method; every
                item that call yields is added to ``self._pii``.
        """
        for scanner in [RegexScanner(), NERScanner()]:
            # Plain loop replaces a list comprehension that was used only for
            # its side effect and built a throwaway list.
            for pii in scanner.scan(context):
                self._pii.add(pii)

        logging.debug(self._pii)