class UsSsn(field_type.FieldType):
    """Field type for US Social Security Numbers.

    Registers three regex patterns of increasing strength and keeps the
    ``patterns`` list ordered from the strongest to the weakest.
    """

    name = "US_SSN"

    # Context words for this field type.
    # TODO: add keyphrase support so "sec" (as in "social sec") can be added.
    context = ["social", "security", "ssn", "ssns", "ssn#", "ss#", "ssid"]

    # Master Regex: r'\b([0-9]{3})-?([0-9]{2})-?([0-9]{4})\b'
    patterns = []
    for _regex, _label, _weight in (
            # Nine digits split by a single dash (5-4 or 3-6 grouping).
            (r'\b(([0-9]{5})-([0-9]{4})|([0-9]{3})-([0-9]{6}))\b',
             'SSN (very weak)', 0.05),
            # Nine consecutive digits with no separators.
            (r'\b[0-9]{9}\b', 'SSN (weak)', 0.3),
            # Dash-separated 3-2-4 grouping.
            (r'\b([0-9]{3})-([0-9]{2})-([0-9]{4})\b',
             'SSN (medium)', 0.5),
    ):
        pattern = field_regex_pattern.RegexFieldPattern()
        pattern.regex = _regex
        pattern.name = _label
        pattern.strength = _weight
        patterns.append(pattern)
    # Drop the loop helpers so they do not become class attributes.
    del _regex, _label, _weight

    # Strongest pattern first.
    patterns.sort(reverse=True, key=lambda candidate: candidate.strength)
class Phone(field_type.FieldType):
    """Field type for phone numbers.

    Registers a strong, a medium and a weak regex pattern and keeps the
    ``patterns`` list ordered from the strongest to the weakest.
    """

    name = "PHONE_NUMBER"
    context = ["phone", "number", "telephone", "cell", "mobile", "call"]

    patterns = []

    # Strong pattern: e.g., (425) 882 8080, 425 882-8080, 425.882.8080
    pattern = field_regex_pattern.RegexFieldPattern()
    # The second alternative must begin with `\d{3}`; a bare `d{3}` would only
    # match the literal text "ddd" and the separator-delimited examples above
    # would never be reported as strong matches.
    pattern.regex = r'(\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]\d{3}[-\.\s]\d{4})'  # noqa: E501
    pattern.name = 'Phone (strong)'
    pattern.strength = 0.7
    patterns.append(pattern)

    # Medium pattern: e.g., 425 8828080
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b(\d{3}[-\.\s]\d{3}[-\.\s]??\d{4})\b'
    pattern.name = 'Phone (medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    # Weak pattern: e.g., 4258828080
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'(\b\d{10}\b)'
    pattern.name = 'Phone (weak)'
    pattern.strength = 0.05
    patterns.append(pattern)

    # Strongest pattern first.
    patterns.sort(key=lambda p: p.strength, reverse=True)
class Iban(field_type.FieldType):
    """Field type for IBAN codes, validated with the mod-97 check digits."""

    name = "IBAN_CODE"
    context = ["iban"]
    should_check_checksum = True

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = u'[a-zA-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{7}([a-zA-Z0-9]?){0,16}'  # noqa: E501
    pattern.name = 'Iban (Medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    def check_checksum(self):
        """Return True when ``self.text`` carries valid IBAN check digits.

        The text is upper-cased before the check: the regex above accepts
        lower-case letters, but the translation table only covers digits and
        upper-case ASCII letters, so a lower-case IBAN would otherwise make
        ``int()`` raise ``ValueError``.

        Returns:
            bool: True if the check digits in positions 2-3 equal the ones
            computed from the rest of the IBAN and the rearranged number is
            congruent to 1 modulo 97.
        """
        # Maps every character of "0-9A-Z" to its index as a decimal string
        # ('0' -> '0', ..., 'A' -> '10', ..., 'Z' -> '35').
        letters = {
            ord(d): str(i)
            for i, d in enumerate(string.digits + string.ascii_uppercase)
        }

        def number_iban(iban):
            # Move the first four characters to the end, then replace
            # every character by its numeric value.
            return (iban[4:] + iban[:4]).translate(letters)

        def generate_iban_check_digits(iban):
            # Compute the check digits with the check-digit slot zeroed.
            number = number_iban(iban[:2] + '00' + iban[4:])
            return '{:0>2}'.format(98 - (int(number) % 97))

        def valid_iban(iban):
            return int(number_iban(iban)) % 97 == 1

        iban = self.text.upper()
        return (generate_iban_check_digits(iban) == iban[2:4]
                and valid_iban(iban))
class Crypto(field_type.FieldType):
    """Field type for Bitcoin-style wallet addresses.

    Candidates found by the regex are validated with the base58 +
    double-SHA256 checksum.
    """

    name = "CRYPTO"
    context = ["wallet", "btc", "bitcoin", "crypto"]
    should_check_checksum = True

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b[13][a-km-zA-HJ-NP-Z0-9]{26,33}\b'
    pattern.name = 'Crypto (Medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    # Copied from:
    # http://rosettacode.org/wiki/Bitcoin/address_validation#Python
    def __decode_base58(self, bc, length):
        """Decode the base58 string ``bc`` into ``length`` big-endian bytes.

        Raises:
            ValueError: if ``bc`` contains a character outside the base58
                alphabet.
            OverflowError: if the decoded number does not fit in ``length``
                bytes.
        """
        digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
        n = 0
        for char in bc:
            n = n * 58 + digits58.index(char)
        return n.to_bytes(length, 'big')

    def check_checksum(self):
        """Return True when ``self.text`` has a valid address checksum.

        The last 4 of the 25 decoded bytes must equal the first 4 bytes of
        the double SHA-256 digest of the preceding 21 bytes. Text that
        cannot be decoded is reported as invalid instead of raising: the
        regex admits the digit ``0``, which is not part of the base58
        alphabet, and long candidates may not fit in 25 bytes.
        """
        try:
            bcbytes = self.__decode_base58(self.text, 25)
        except (ValueError, OverflowError):
            return False
        expected = sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
        return bcbytes[-4:] == expected
class UsDriverLicense(field_type.FieldType):
    """Field type for US driver license numbers.

    Registers several regex patterns of differing strength and keeps the
    ``patterns`` list ordered from the strongest to the weakest.
    """

    name = "US_DRIVER_LICENSE"
    context = [
        "driver", "license", "permit", "id", "lic", "identification", "card",
        "cards", "dl", "dls", "cdls", "id", "lic#"
    ]

    # List from https://ntsi.com/drivers-license-format/
    # ---------------

    patterns = []

    # WA Driver License number is relatively unique as it also
    # includes '*' chars.
    # However it can also be 12 letters which makes every 12 letter
    # word a match. Therefore we split WA driver license
    # regex: r'\b([A-Z][A-Z0-9*]{11})\b' into two regexes
    # With different weights, one to indicate letters only and
    # one to indicate at least one digit or one '*'
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b((?=.*\d)([A-Z][A-Z0-9*]{11})|(?=.*\*)([A-Z][A-Z0-9*]{11}))\b'  # noqa: E501
    pattern.name = 'Driver License - WA (weak) '
    pattern.strength = 0.4
    patterns.append(pattern)

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([A-Z]{12})\b'
    pattern.name = 'Driver License - WA (very weak) '
    pattern.strength = 0.0
    patterns.append(pattern)

    pattern = field_regex_pattern.RegexFieldPattern()
    # The `[A-Z]{2}[0-9]{2,5}` alternative needs its opening bracket; without
    # it the alternative reads as the literal text "A-Z]]" followed by digits.
    pattern.regex = r'\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|[A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\b'  # noqa: E501
    pattern.name = 'Driver License - Alphanumeric (weak) '
    pattern.strength = 0.3
    patterns.append(pattern)

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([0-9]{1,9}|[0-9]{4,10}|[0-9]{6,10}|[0-9]{1,12}|[0-9]{12,14}|[0-9]{16})\b'  # noqa: E501
    pattern.name = 'Driver License - Digits (very weak)'
    pattern.strength = 0.05
    patterns.append(pattern)

    # Strongest pattern first.
    patterns.sort(key=lambda p: p.strength, reverse=True)

    # NOTE(review): a stray triple-quote token followed the sort call with no
    # matching delimiter in view; it is not reproduced as code here. Confirm
    # nothing after this class was meant to be wrapped in a string literal.
class Ip(field_type.FieldType):
    """Field type for IP addresses, with one IPv4 and one IPv6 pattern."""

    name = "IP_ADDRESS"
    context = ["ip", "ipv4", "ipv6"]

    patterns = []

    # Dotted quad, each octet restricted to 0-255.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'  # noqa: E501
    pattern.name = 'IPv4'
    pattern.strength = 0.6
    patterns.append(pattern)

    # IPv6, optionally ending in an embedded dotted quad. The octets of that
    # dotted quad use `25[0-5]` so that 255 is accepted, in line with the
    # IPv4 pattern above.
    # NOTE(review): the hex groups only list lower-case a-f; upper-case
    # addresses match only if the caller compiles with a case-insensitive
    # flag - confirm against the matching code.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*'  # noqa: E501
    pattern.name = 'IPv6'
    pattern.strength = 0.6
    patterns.append(pattern)

    # Strongest pattern first.
    patterns.sort(key=lambda p: p.strength, reverse=True)
class UsPassport(field_type.FieldType):
    """Field type for US passport numbers, with a single very weak pattern."""

    name = "US_PASSPORT"
    context = [
        "us",
        "united",
        "states",
        "passport",
        "number",
        "passport#",
        "travel",
        "document",
    ]

    patterns = []

    # Very weak pattern: any standalone run of exactly nine digits is a
    # candidate passport number.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'(\b[0-9]{9}\b)'
    pattern.name = 'Passport (very weak)'
    pattern.strength = 0.05
    patterns.append(pattern)
class CreditCard(field_type.FieldType):
    """Field type for credit card numbers, validated with the Luhn checksum."""

    name = "CREDIT_CARD"
    should_check_checksum = True
    # TODO: add "american express" after adding keyphrase support
    context = [
        "credit", "card", "visa", "mastercard", "amex", "discover", "jcb",
        "diners", "maestro", "instapayment"
    ]

    patterns = []

    # All credit cards - a weak pattern is enough because credit card
    # numbers carry a checksum.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b'  # noqa: E501
    pattern.name = 'All Credit Cards (weak)'
    pattern.strength = 0.3
    patterns.append(pattern)

    def __luhn_checksum(self):
        """Return the Luhn sum of ``self.sanitized_value`` modulo 10."""
        # Work from the rightmost digit towards the left.
        from_right = [int(ch) for ch in str(self.sanitized_value)][::-1]
        # Digits at even offsets from the right are added unchanged.
        plain_total = sum(from_right[0::2])
        # Digits at odd offsets are doubled and the decimal digits of the
        # doubled value are added.
        doubled_total = sum(
            sum(divmod(2 * digit, 10)) for digit in from_right[1::2]
        )
        return (plain_total + doubled_total) % 10

    def __sanitize_value(self):
        """Store ``self.text`` without dashes and spaces."""
        self.sanitized_value = self.text.translate(
            str.maketrans('', '', '- '))

    def check_checksum(self):
        """Return True when the sanitized text has a Luhn checksum of 0."""
        self.__sanitize_value()
        return not self.__luhn_checksum()
class Email(field_type.FieldType):
    """Field type for e-mail addresses.

    Regex candidates are additionally checked by running the text through
    ``tldextract``.
    """

    name = "EMAIL_ADDRESS"
    should_check_checksum = True
    context = ["email"]

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r"\b((([!#$%&'*+\-/=?^_`{|}~\w])|([!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~\.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w]))[@]\w+([-.]\w+)*\.\w+([-.]\w+)*)\b"  # noqa: E501
    pattern.name = 'Email (Medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    def check_checksum(self):
        """Return True when ``tldextract`` yields a non-empty FQDN.

        The comparison uses ``!=`` rather than ``is not``: identity checks
        against a string literal depend on interpreter string interning and
        raise a ``SyntaxWarning`` on Python 3.8+.
        """
        result = tldextract.extract(self.text)
        return result.fqdn != ''
class UsBank(field_type.FieldType):
    """Field type for US bank account numbers, with a single weak pattern."""

    name = "US_BANK_NUMBER"
    # The comma after "bank" is required: without it Python concatenates the
    # adjacent literals into the single word "bankchecking".
    context = [
        "bank",
        # TODO: change to "checking account" as part of keyphrase change
        "checking",
        "account",
        "account#",
        "acct",
        "saving",
        "debit"
    ]

    patterns = []

    # Weak pattern: any standalone run of 8 to 17 digits is a weak match,
    # e.g., 14019033
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b[0-9]{8,17}\b'
    pattern.name = 'Bank Account (weak)'
    pattern.strength = 0.05
    patterns.append(pattern)
class Domain(field_type.FieldType):
    """Field type for domain names.

    Regex candidates are additionally checked by running the text through
    ``tldextract``.
    """

    name = "DOMAIN_NAME"
    should_check_checksum = True
    context = ["domain", "ip"]

    patterns = []

    # Basic pattern, since domain has a checksum function
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b(((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,86}[a-zA-Z0-9]))\.(([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,73}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25})))|((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,162}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25}))))\b'  # noqa: E501
    pattern.name = 'Domain ()'
    pattern.strength = 0.5
    patterns.append(pattern)

    def check_checksum(self):
        """Return True when ``tldextract`` yields a non-empty FQDN.

        The comparison uses ``!=`` rather than ``is not``: identity checks
        against a string literal depend on interpreter string interning and
        raise a ``SyntaxWarning`` on Python 3.8+.
        """
        result = tldextract.extract(self.text)
        return result.fqdn != ''
class UkNhs(field_type.FieldType):
    """Field type for UK NHS numbers, validated with a modulo-11 checksum."""

    name = "UK_NHS"
    should_check_checksum = True
    context = [
        "national health service", "nhs", "health services authority",
        "health authority"
    ]

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([0-9]{3})[- ]?([0-9]{3})[- ]?([0-9]{4})\b'
    pattern.name = 'NHS (medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    # Strongest pattern first.
    patterns.sort(key=lambda p: p.strength, reverse=True)

    def __sanitize_value(self):
        """Store ``self.text`` without dashes and spaces."""
        self.sanitized_value = self.text.replace('-', '').replace(' ', '')

    def check_checksum(self):
        """Return True when the weighted digit sum is a multiple of 11.

        Each digit of the sanitized text is multiplied by a weight that
        starts at 10 for the first digit and decreases by one per position.
        The number is accepted when the total is divisible by 11. The result
        is compared by value; the previous ``is 11`` identity test only
        worked because of CPython's small-integer caching.
        """
        self.__sanitize_value()
        total = sum(
            int(c) * (10 - position)
            for position, c in enumerate(self.sanitized_value)
        )
        return total % 11 == 0