Exemplo n.º 1
0
class UsSsn(field_type.FieldType):
    """Field type describing US Social Security Numbers.

    Declares the context words plus three regex patterns of different
    strength. ``patterns`` ends up ordered from strongest to weakest.
    """

    name = "US_SSN"
    # TODO: add "sec" once keyphrase support exists (for "social sec").
    context = ["social", "security", "ssn", "ssns", "ssn#", "ss#", "ssid"]

    # Master Regex: r'\b([0-9]{3})-?([0-9]{2})-?([0-9]{4})\b'

    # Nine digits split by a single dash into a 5-4 or a 3-6 grouping.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.name = 'SSN (very weak)'
    pattern.regex = r'\b(([0-9]{5})-([0-9]{4})|([0-9]{3})-([0-9]{6}))\b'
    pattern.strength = 0.05
    patterns = [pattern]

    # Nine consecutive digits without any separator.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.name = 'SSN (weak)'
    pattern.regex = r'\b[0-9]{9}\b'
    pattern.strength = 0.3
    patterns.append(pattern)

    # The dashed 3-2-4 layout.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.name = 'SSN (medium)'
    pattern.regex = r'\b([0-9]{3})-([0-9]{2})-([0-9]{4})\b'
    pattern.strength = 0.5
    patterns.append(pattern)

    # Strongest pattern first.
    patterns = sorted(
        patterns, key=lambda candidate: candidate.strength, reverse=True)
Exemplo n.º 2
0
class Phone(field_type.FieldType):
    """Field type describing phone numbers.

    Declares the context words plus three regex patterns of different
    strength. ``patterns`` ends up ordered from strongest to weakest.
    """

    name = "PHONE_NUMBER"
    context = ["phone", "number", "telephone", "cell", "mobile", "call"]
    patterns = []

    # Strong pattern: e.g., (425) 882 8080, 425 882-8080, 425.882.8080
    # The second alternative must start with the digit class `\d{3}`; it
    # previously read `d{3}`, which only matched the literal text "ddd".
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'(\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]\d{3}[-\.\s]\d{4})'  # noqa: E501
    pattern.name = 'Phone (strong)'
    pattern.strength = 0.7
    patterns.append(pattern)

    # Medium pattern: e.g., 425 8828080
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b(\d{3}[-\.\s]\d{3}[-\.\s]??\d{4})\b'
    pattern.name = 'Phone (medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    # Weak pattern: e.g., 4258828080
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'(\b\d{10}\b)'
    pattern.name = 'Phone (weak)'
    pattern.strength = 0.05
    patterns.append(pattern)

    # Strongest pattern first.
    patterns.sort(key=lambda p: p.strength, reverse=True)
Exemplo n.º 3
0
class Iban(field_type.FieldType):
    """Field type describing IBAN codes.

    Candidates found by the regex are meant to be confirmed with
    ``check_checksum`` (``should_check_checksum`` is set).
    """

    name = "IBAN_CODE"
    context = ["iban"]
    should_check_checksum = True

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = u'[a-zA-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{7}([a-zA-Z0-9]?){0,16}'  # noqa: E501
    pattern.name = 'Iban (Medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    def check_checksum(self):
        """Validate the mod-97 check digits of ``self.text``.

        The text is upper-cased first: the regex above accepts lowercase
        letters, but the letter-to-number table only covers digits and
        uppercase letters, so a lowercase IBAN used to raise ValueError.

        Returns:
            True when the check digits recomputed from the text equal the
            ones it carries (characters 3-4) and the rearranged numeric
            form is congruent to 1 modulo 97. False otherwise, including
            when the text cannot be converted to a number.
        """
        # '0'-'9' map to "0"-"9" and 'A'-'Z' map to "10"-"35".
        letters = {
            ord(symbol): str(value)
            for value, symbol in enumerate(
                string.digits + string.ascii_uppercase)
        }

        def number_iban(iban):
            # Move the first four characters to the end, then replace
            # every character by its numeric value.
            return (iban[4:] + iban[:4]).translate(letters)

        def generate_iban_check_digits(iban):
            # Recompute the check digits with the check field zeroed.
            number = number_iban(iban[:2] + '00' + iban[4:])
            return '{:0>2}'.format(98 - (int(number) % 97))

        def valid_iban(iban):
            return int(number_iban(iban)) % 97 == 1

        iban = self.text.upper()
        try:
            return (generate_iban_check_digits(iban) == iban[2:4]
                    and valid_iban(iban))
        except ValueError:
            # A character outside the table survived translation, so the
            # text is not a well-formed IBAN.
            return False
Exemplo n.º 4
0
class Crypto(field_type.FieldType):
    """Field type describing Bitcoin-style wallet addresses.

    Candidates found by the regex are meant to be confirmed with
    ``check_checksum`` (``should_check_checksum`` is set).
    """

    name = "CRYPTO"
    context = ["wallet", "btc", "bitcoin", "crypto"]

    should_check_checksum = True

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b[13][a-km-zA-HJ-NP-Z0-9]{26,33}\b'
    pattern.name = 'Crypto (Medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    # Checksum code copied from:
    # http://rosettacode.org/wiki/Bitcoin/address_validation#Python

    def __decode_base58(self, bc, length):
        """Decode the base58 string ``bc`` into ``length`` big-endian bytes.

        Raises:
            ValueError: if ``bc`` contains a character outside the base58
                alphabet.
            OverflowError: if the decoded number needs more than
                ``length`` bytes.
        """
        digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
        n = 0
        for char in bc:
            n = n * 58 + digits58.index(char)
        return n.to_bytes(length, 'big')

    def check_checksum(self):
        """Check the embedded checksum of the address in ``self.text``.

        Returns:
            True when the last 4 of the 25 decoded bytes equal the first
            4 bytes of the double SHA-256 of the preceding 21 bytes.
            False otherwise, including when the text cannot be decoded.
        """
        try:
            bcbytes = self.__decode_base58(self.text, 25)
        except (ValueError, OverflowError):
            # The regex admits '0', which is not a base58 digit, and up to
            # 34 characters, which may not fit in 25 bytes. Such text is
            # not a valid address rather than a reason to crash.
            return False
        return bcbytes[-4:] == sha256(sha256(
            bcbytes[:-4]).digest()).digest()[:4]
Exemplo n.º 5
0
class UsDriverLicense(field_type.FieldType):
    """Field type describing US driver license numbers.

    Declares the context words plus four regex patterns of different
    strength. ``patterns`` ends up ordered from strongest to weakest.
    """

    name = "US_DRIVER_LICENSE"
    context = [
        "driver", "license", "permit", "id", "lic", "identification", "card",
        "cards", "dl", "dls", "cdls", "id", "lic#"
    ]

    # List from https://ntsi.com/drivers-license-format/
    # ---------------
    patterns = []

    # WA Driver License number is relatively unique as it also
    # includes '*' chars.
    # However it can also be 12 letters which makes every 12 letter
    # word a match. Therefore we split WA driver license
    # regex: r'\b([A-Z][A-Z0-9*]{11})\b' into two regexes
    # With different weights, one to indicate letters only and
    # one to indicate at least one digit or one '*'
    # NOTE(review): the lookaheads use `.*`, so they are not limited to the
    # 12 matched characters - confirm this is intended.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b((?=.*\d)([A-Z][A-Z0-9*]{11})|(?=.*\*)([A-Z][A-Z0-9*]{11}))\b'  # noqa: E501
    pattern.name = 'Driver License - WA (weak) '
    pattern.strength = 0.4
    patterns.append(pattern)

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([A-Z]{12})\b'
    pattern.name = 'Driver License - WA (very weak) '
    pattern.strength = 0.0
    patterns.append(pattern)

    # Alphanumeric formats. The alternative `[A-Z]{2}[0-9]{2,5}` previously
    # lacked its opening bracket (`A-Z]{2}...`), which made it match the
    # literal text "A-Z]]" followed by digits.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|[A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\b'  # noqa: E501
    pattern.name = 'Driver License - Alphanumeric (weak) '
    pattern.strength = 0.3
    patterns.append(pattern)

    # Digits-only formats.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([0-9]{1,9}|[0-9]{4,10}|[0-9]{6,10}|[0-9]{1,12}|[0-9]{12,14}|[0-9]{16})\b'  # noqa: E501
    pattern.name = 'Driver License - Digits (very weak)'
    pattern.strength = 0.05
    patterns.append(pattern)

    # Strongest pattern first.
    patterns.sort(key=lambda p: p.strength, reverse=True)
    '''
Exemplo n.º 6
0
class Ip(field_type.FieldType):
    """Field type describing IP addresses.

    Declares the context words plus one IPv4 and one IPv6 regex pattern,
    both with the same strength.
    """

    name = "IP_ADDRESS"
    context = ["ip", "ipv4", "ipv6"]

    patterns = []

    # Dotted quad where every octet is limited to 0-255.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'  # noqa: E501
    pattern.name = 'IPv4'
    pattern.strength = 0.6
    patterns.append(pattern)

    # IPv6, including the form that ends in an embedded dotted quad. The
    # embedded octets use `25[0-5]` so that 255 is accepted, in line with
    # the IPv4 pattern above (they previously read `25[0-4]`).
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*'  # noqa: E501
    pattern.name = 'IPv6'
    pattern.strength = 0.6
    patterns.append(pattern)

    # Strongest pattern first (the sort is stable, so IPv4 stays ahead).
    patterns.sort(key=lambda p: p.strength, reverse=True)
Exemplo n.º 7
0
class UsPassport(field_type.FieldType):
    """Field type describing US passport numbers.

    Declares the context words and a single, very weak regex pattern.
    """

    name = "US_PASSPORT"
    context = [
        "us",
        "united",
        "states",
        "passport",
        "number",
        "passport#",
        "travel",
        "document",
    ]

    # Very weak pattern: any standalone run of exactly nine digits.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.name = 'Passport (very weak)'
    pattern.regex = r'(\b[0-9]{9}\b)'
    pattern.strength = 0.05

    patterns = [pattern]
Exemplo n.º 8
0
class CreditCard(field_type.FieldType):
    """Field type describing credit card numbers.

    A single weak regex finds candidates; ``check_checksum`` then applies
    the Luhn test to the candidate text.
    """

    name = "CREDIT_CARD"
    should_check_checksum = True
    # TODO: add "american express" after adding keyphrase support
    context = [
        "credit", "card", "visa", "mastercard", "amex", "discover", "jcb",
        "diners", "maestro", "instapayment"
    ]

    # One weak pattern covers all card types because the checksum does
    # the real filtering.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.name = 'All Credit Cards (weak)'
    pattern.regex = r'\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b'  # noqa: E501
    pattern.strength = 0.3
    patterns = [pattern]

    def __luhn_checksum(self):
        """Return the Luhn sum of ``self.sanitized_value`` modulo 10."""
        # Walk the digits from the rightmost one.
        from_right = [int(char) for char in str(self.sanitized_value)][::-1]
        total = 0
        for position, digit in enumerate(from_right):
            if position % 2 == 0:
                total += digit
            else:
                # Every second digit is doubled and contributes the sum
                # of the digits of the doubled value.
                tens, units = divmod(digit * 2, 10)
                total += tens + units
        return total % 10

    def __sanitize_value(self):
        """Store ``self.text`` without dashes and spaces."""
        separators = {ord('-'): None, ord(' '): None}
        self.sanitized_value = self.text.translate(separators)

    def check_checksum(self):
        """Return True when the sanitized text passes the Luhn test."""
        self.__sanitize_value()
        return not self.__luhn_checksum()
Exemplo n.º 9
0
class Email(field_type.FieldType):
    """Field type describing e-mail addresses.

    Candidates found by the regex are meant to be confirmed with
    ``check_checksum`` (``should_check_checksum`` is set).
    """

    name = "EMAIL_ADDRESS"
    should_check_checksum = True
    context = ["email"]

    patterns = []

    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r"\b((([!#$%&'*+\-/=?^_`{|}~\w])|([!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~\.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w]))[@]\w+([-.]\w+)*\.\w+([-.]\w+)*)\b"  # noqa: E501
    pattern.name = 'Email (Medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    def check_checksum(self):
        """Return True when tldextract reports a non-empty fqdn for the text.

        The comparison is by value: testing identity against a string
        literal (``is not ''``) depends on interpreter string interning
        and raises a SyntaxWarning on Python 3.8+.
        """
        result = tldextract.extract(self.text)
        return result.fqdn != ''
Exemplo n.º 10
0
class UsBank(field_type.FieldType):
    """Field type describing US bank account numbers.

    Declares the context words and a single, very weak regex pattern.
    """

    name = "US_BANK_NUMBER"
    # Every entry needs its trailing comma: a missing one after "bank"
    # used to merge it with "checking" into the single word
    # "bankchecking".
    context = [
        "bank",
        # TODO: change to "checking account" as part of keyphrase change
        "checking",
        "account",
        "account#",
        "acct",
        "saving",
        "debit",
    ]

    patterns = []

    # Weak pattern: any standalone run of 8 to 17 digits is a weak match,
    # e.g., 14019033
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b[0-9]{8,17}\b'
    pattern.name = 'Bank Account (weak)'
    pattern.strength = 0.05
    patterns.append(pattern)
Exemplo n.º 11
0
class Domain(field_type.FieldType):
    """Field type describing domain names.

    Candidates found by the regex are meant to be confirmed with
    ``check_checksum`` (``should_check_checksum`` is set).
    """

    name = "DOMAIN_NAME"
    should_check_checksum = True
    context = ["domain", "ip"]

    patterns = []

    # Basic pattern, since domain has a checksum function
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b(((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,86}[a-zA-Z0-9]))\.(([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,73}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25})))|((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,162}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25}))))\b'  # noqa: E501
    pattern.name = 'Domain ()'
    pattern.strength = 0.5

    patterns.append(pattern)

    def check_checksum(self):
        """Return True when tldextract reports a non-empty fqdn for the text.

        The comparison is by value: testing identity against a string
        literal (``is not ''``) depends on interpreter string interning
        and raises a SyntaxWarning on Python 3.8+.
        """
        result = tldextract.extract(self.text)
        return result.fqdn != ''
Exemplo n.º 12
0
class UkNhs(field_type.FieldType):
    """Field type describing UK NHS numbers.

    Candidates found by the regex are meant to be confirmed with
    ``check_checksum`` (``should_check_checksum`` is set).
    """

    name = "UK_NHS"
    should_check_checksum = True
    context = [
        "national health service", "nhs", "health services authority",
        "health authority"
    ]

    patterns = []

    # Ten digits in a 3-3-4 layout, optionally separated by '-' or ' '.
    pattern = field_regex_pattern.RegexFieldPattern()
    pattern.regex = r'\b([0-9]{3})[- ]?([0-9]{3})[- ]?([0-9]{4})\b'
    pattern.name = 'NHS (medium)'
    pattern.strength = 0.5
    patterns.append(pattern)

    patterns.sort(key=lambda p: p.strength, reverse=True)

    def __sanitize_value(self):
        """Store ``self.text`` without dashes and spaces."""
        self.sanitized_value = self.text.replace('-', '').replace(' ', '')

    def check_checksum(self):
        """Validate the modulus-11 checksum of the number in ``self.text``.

        Each character of the sanitized text is weighted 10, 9, 8, ...
        from left to right, so for a ten-digit number the final (check)
        digit gets weight 1. The number is valid when the weighted sum is
        divisible by 11.

        The result is compared by value; the previous ``is 11`` identity
        test on an int relied on interpreter caching of small integers.

        Returns:
            True when the checksum holds, False otherwise.

        Raises:
            ValueError: if the sanitized text contains a non-digit
                character.
        """
        self.__sanitize_value()

        total = sum(
            int(char) * (10 - index)
            for index, char in enumerate(self.sanitized_value))

        # Same condition as "11 - (total % 11) == 11".
        return total % 11 == 0