def test_std3(self):
    # With STD3 rules disabled the underscore is tolerated and only the
    # case mapping is applied.
    relaxed = idna.uts46_remap('A_', std3_rules=False)
    self.assertEqual(relaxed, 'a_')

    # With STD3 rules enabled the same input must be rejected.
    with self.assertRaises(idna.InvalidCodepoint):
        idna.uts46_remap('A_', std3_rules=True)
def punycode_special_host(url):
    """Punycode-encode ``url.host`` in place for special-scheme URLs.

    Nothing happens unless ``url.host`` is truthy and ``url.scheme`` is in
    ``urlcanon.SPECIAL_SCHEMES``. The host is decoded as UTF-8 and encoded
    with ``idna.encode(..., uts46=True)``. If that fails, the host is UTS-46
    remapped, split on ``.``, and each label is encoded with
    ``idna2003.ToASCII``. If the fallback fails as well, ``url.host`` is left
    untouched (deliberate best effort).

    :param url: object exposing mutable ``host`` and ``scheme`` attributes;
        ``host`` must support ``.decode('utf-8')``
    """
    if not url.host or url.scheme not in urlcanon.SPECIAL_SCHEMES:
        return

    # https://github.com/kjd/idna/issues/40#issuecomment-285496926
    try:
        url.host = idna.encode(url.host.decode('utf-8'), uts46=True)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        try:
            remapped = idna.uts46_remap(url.host.decode('utf-8'))
            labels = remapped.split('.')
            punycode_labels = [idna2003.ToASCII(label) for label in labels]
            url.host = b'.'.join(punycode_labels)
        except Exception:
            # Best effort only: keep the original host when both the
            # IDNA 2008 and the IDNA 2003 attempts fail.
            pass
def _domain_to_ascii(domain, strict=False):
    """Encode ``domain`` to ASCII, trying IDNA 2008 before IDNA 2003.

    The IDNA 2008 encoder (with UTS-46 preprocessing) is attempted first.
    Only if it raises ``idna.IDNAError`` is the domain remapped and encoded
    label by label with IDNA 2003.
    """
    try:
        return idna.encode(
            domain,
            strict=strict,
            std3_rules=strict,
            uts46=True,
            transitional=False,
        )
    except idna.IDNAError:
        # Fallback: UTS-46 remap, then IDNA 2003 per label.
        text = domain
        if isinstance(text, (bytes, bytearray)):
            text = text.decode("ascii")
        text = idna.uts46_remap(text, std3_rules=strict, transitional=False)

        # Strict mode splits on the ASCII full stop only; otherwise the
        # module-level dot regex decides what counts as a separator.
        parts = text.split(".") if strict else IDNA_DOTS_REGEX.split(text)
        if not parts or parts == [""]:
            raise idna.IDNAError("Empty domain")

        # An empty last label means the input ended with a separator.
        has_trailing_dot = parts[-1] == ""
        if has_trailing_dot:
            parts.pop()

        encoded_labels = []
        for part in parts:
            try:
                ascii_label = idna2003.ToASCII(part)
            except UnicodeError:
                if strict:
                    raise
                # Non-strict: keep the unencodable label as UTF-8 bytes.
                encoded_labels.append(part.encode("utf-8"))
            else:
                if not ascii_label:
                    raise idna.IDNAError("Empty label")
                encoded_labels.append(ascii_label)

        if has_trailing_dot:
            # Re-create the trailing separator when joining.
            encoded_labels.append(b"")

        joined = b".".join(encoded_labels)
        if not idna.valid_string_length(joined, has_trailing_dot):
            raise idna.IDNAError("Domain too long")
        return joined
def normalize_name(name: str) -> str:
    """
    Normalize an ENS name following the name syntax of `EIP-137
    <https://github.com/ethereum/EIPs/blob/master/EIPS/eip-137.md#name-syntax>`_

    No check is made on whether ``name`` is a single label or a fully
    qualified domain. Falsy input is returned unchanged; bytes input is
    decoded as UTF-8 before normalization.

    :param str name: the dot-separated ENS name
    :raises InvalidName: if ``name`` has invalid syntax
    """
    if not name:
        return name

    text = name.decode('utf-8') if isinstance(name, (bytes, bytearray)) else name

    try:
        normalized = idna.uts46_remap(text, std3_rules=True, transitional=False)
    except idna.IDNAError as exc:
        raise InvalidName(f"{text} is an invalid name, because {exc}") from exc
    return normalized
def validate_email_domain_part(domain):
    """Validate and normalize the domain part of an email address.

    Returns a dict holding the IDNA ASCII form under ``"ascii_domain"`` and
    the decoded internationalized form under ``"domain"``. Every validation
    failure is reported as ``EmailSyntaxError``.
    """
    # Nothing at all after the @-sign.
    if len(domain) == 0:
        raise EmailSyntaxError("There must be something after the @-sign.")

    # UTS-46 normalization: casefolding, NFC normalization, and mapping of
    # every label separator (full stop, fullwidth full stop, ideographic
    # full stop, halfwidth ideographic full stop) to a plain period. Invalid
    # characters such as "⒈", which would expand to include a period, make
    # this step raise.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as exc:
        raise EmailSyntaxError(
            "The domain name %s contains invalid characters (%s)."
            % (domain, str(exc)))

    # Separators are plain periods now, so empty labels can be detected
    # here. The IDNA library does not deal well with a leading dot, a
    # trailing dot, or two dots in a row.
    if domain.endswith("."):
        raise EmailSyntaxError("An email address cannot end with a period.")
    if domain.startswith("."):
        raise EmailSyntaxError(
            "An email address cannot have a period immediately after the @-sign.")
    if ".." in domain:
        raise EmailSyntaxError(
            "An email address cannot have two periods in a row.")

    # Always convert to IDNA ASCII, whether or not international characters
    # are present; for ASCII-only domains this changes nothing. With
    # internationalized characters, the MTA must support SMTPUTF8 or the
    # mail client must convert the domain to IDNA before submission.
    #
    # This step incorrectly 'fixes' leading periods by dropping them and
    # reports "No input" for two periods in a row, hence the checks above.
    try:
        ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
    except idna.IDNAError as exc:
        reason = str(exc)
        if "Domain too long" in reason:
            # Nothing more specific can be said: after UTS-46 normalization
            # the length check runs on a string that differs from what the
            # user supplied, and it is unclear which form the limit is
            # meant to apply to.
            raise EmailSyntaxError(
                "The email address is too long after the @-sign.")
        raise EmailSyntaxError(
            "The domain name %s contains invalid characters (%s)."
            % (domain, reason))

    # The input may already have been IDNA ASCII. Decoding verifies it is
    # real IDNA and not something that merely looks like it; ASCII-only
    # domains come back unchanged. The result is the canonical
    # internationalized form, used in the error messages below.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as exc:
        raise EmailSyntaxError(
            "The domain name %s is not valid IDNA (%s)."
            % (ascii_domain, str(exc)))

    # RFC 5321 4.5.3.1.2
    # Octets are counted, which can far exceed the character count of an
    # internationalized domain, on the assumption that the domain may be
    # sent as IDNA ASCII without SMTPUTF8. idna.encode checks this too, so
    # this branch is never reached.
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        raise EmailSyntaxError(
            "The email address is too long after the @-sign.")

    # A "dot atom text", per RFC 2822 3.2.4, restricted to the characters
    # allowed in a hostname (see ATEXT_HOSTNAME), anchored at the very end
    # of the string. Probably entirely redundant with idna.decode, which
    # also checks this format.
    hostname_pattern = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*' + "\\Z"
    if not re.match(hostname_pattern, ascii_domain):
        raise EmailSyntaxError(
            "The email address contains invalid characters after the @-sign.")

    # Every publicly deliverable address has a domain with at least one
    # period, and every TLD ends with a letter.
    if "." not in ascii_domain:
        raise EmailSyntaxError(
            "The domain name %s is not valid. It should have a period."
            % domain_i18n)
    if re.search(r"[A-Za-z]\Z", ascii_domain) is None:
        raise EmailSyntaxError(
            "The domain name %s is not valid. It is not within a valid top-level domain."
            % domain_i18n)

    # Hand back both the on-the-wire IDNA ASCII form (except possibly under
    # SMTPUTF8) and the canonical Unicode form, which is better for display.
    # This should also cover RFC 6532 section 3.1's suggestion to apply
    # Unicode NFC normalization to addresses.
    result = {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }
    return result
def _validate_email_address( value, allow_unnormalized, allow_smtputf8, required, ): if value is None: if required: raise TypeError("required value is None") return if not isinstance(value, six.text_type): raise TypeError( ("expected unicode string, but value is of type {cls!r}").format( cls=value.__class__.__name__)) parts = value.split("@") if len(parts) < 2: raise ValueError("email address is missing an '@' sign") if len(parts) > 2: raise ValueError("email address contains multiple '@' signs") local_part, domain = parts # === Validate and normalize the email address' local part === if not local_part: raise ValueError("expected local part before '@', but found nothing") # RFC 5321 4.5.3.1.1 # We're checking the number of characters here. If the local part # is ASCII-only, then that's the same as bytes (octets). If it's # internationalized, then the UTF-8 encoding may be longer, but # that may not be relevant. We will check the total address length # instead. if len(local_part) > _LOCAL_PART_MAX_LENGTH: raise ValueError( "expected at most 64 characters, " "but local part contains {chars}".format(chars=len(local_part))) if re.match(_DOT_ATOM_TEXT + "\\Z", local_part): # The local part is valid ascii. normalized_local_part = local_part ascii_local_part = local_part else: if not re.match(_DOT_ATOM_TEXT_UTF8 + "\\Z", local_part): # It's not a valid internationalized address either. Report which # characters were not valid. bad_chars = ", ".join( sorted( set(c for c in local_part if not re.match( u"[" + (_ATEXT if not allow_smtputf8 else _ATEXT_UTF8) + u"]", c, )))) raise ValueError( "local part contains invalid characters: {bad_chars!r}".format( bad_chars=bad_chars)) if not allow_smtputf8: raise ValueError("invalid non-ascii characters in local part") # RFC 6532 section 3.1 also says that Unicode NFC normalization should # be applied. 
normalized_local_part = unicodedata.normalize("NFC", local_part) ascii_local_part = None # === Validate and normalize the email address' domain === if len(domain) == 0: raise ValueError("expected domain name after '@', but found nothing") # Perform UTS-46 normalization, which includes casefolding, NFC # normalization, and converting all label separators (the period/full # stop, fullwidth full stop, ideographic full stop, and halfwidth # ideographic full stop) to basic periods. # It will also raise an exception if there is an invalid character in the # input, such as "⒈" which is invalid because it would expand to include # a period. try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: raise ValueError( "domain name contains invalid characters: {error}".format(error=e)) # Now we can perform basic checks on the use of periods (since equivalent # symbols have been mapped to periods). These checks are needed because the # IDNA library doesn't handle well domains that have empty labels (i.e. # initial dot, trailing dot, or two dots in a row). if domain.endswith("."): raise ValueError("unexpected period at end of domain name") if domain.startswith("."): raise ValueError("unexpected period at start of domain name") if ".." in domain: raise ValueError("unexpected consecutive periods in domain name") # Regardless of whether international characters are actually used, # first convert to IDNA ASCII. For ASCII-only domains, the transformation # does nothing. If internationalized characters are present, the MTA # must either support SMTPUTF8 or the mail client must convert the # domain name to IDNA before submission. # # Unfortunately this step incorrectly 'fixes' domain names with leading # periods by removing them, so we have to check for this above. It also # gives a funky error message ("No input") when there are two periods in a # row, also checked separately above. 
try: ascii_domain = idna.encode(domain, uts46=False).decode("ascii") except idna.IDNAError as e: if "Domain too long" in str(e): # We can't really be more specific because UTS-46 normalization # means the length check is applied to a string that is different # from the one the user supplied. Also I'm not sure if the length # check applies to the internationalized form, the IDNA ASCII # form, or even both! raise ValueError("domain name is too long") raise ValueError( "domain name contains invalid characters: {error}".format(error=e)) # We may have been given an IDNA ASCII domain to begin with. Check that # the domain actually conforms to IDNA. It could look like IDNA but not be # actual IDNA. For ASCII-only domains, the conversion out of IDNA just # gives the same thing back. # # This gives us the canonical internationalized form of the domain, which # we should use in all error messages. try: normalized_domain = idna.decode(ascii_domain.encode("ascii")) except idna.IDNAError as e: raise ValueError( "domain name is not valid idna: {error}".format(error=e)) # RFC 5321 4.5.3.1.2 # We're checking the number of bytes (octets) here, which can be much # higher than the number of characters in internationalized domains, on # the assumption that the domain may be transmitted without SMTPUTF8 as # IDNA ASCII. This is also checked by idna.encode, so this exception is # never reached. if len(ascii_domain) > _DOMAIN_MAX_LENGTH: raise ValueError( "expected no more than 255 characters after idna encoding, " "but domain expanded to {count}".format(count=len(ascii_domain))) # Check the regular expression. This is probably entirely redundant with # idna.decode, which also checks this format. m = re.match(_DOT_ATOM_TEXT_HOSTNAME + "\\Z", ascii_domain) if not m: raise ValueError("unexpected characters in address domain") # All publicly deliverable addresses have domain named with at least one # period. We also know that all TLDs end with a letter. if "." 
not in ascii_domain: raise ValueError( "expected a subdomain of a tld, but domain is missing a period") if not re.search(r"[A-Za-z]\Z", ascii_domain): raise ValueError( "expected a subdomain of a tld, but tld does not match pattern") # === Check bulk properties of the email address === normalized_email = normalized_local_part + "@" + normalized_domain if ascii_local_part and ascii_domain: ascii_email = ascii_local_part + "@" + ascii_domain else: ascii_email = None # If the email address has an ASCII representation, then we assume it may # be transmitted in ASCII (we can't assume SMTPUTF8 will be used on all # hops to the destination) and the length limit applies to ASCII # characters (which is the same as octets). The number of characters in # may be many fewer (because IDNA ASCII is verbose) and could be less than # 254 Unicode characters, and of course the number of octets over the # limit may not be the number of characters over the limit, so if the # email address is internationalized, we can't give any simple information # about why the address is too long. # # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not # Unicode characters) is at most 254 octets. If the addres is transmitted # using SMTPUTF8, then the length limit probably applies to the UTF-8 # encoded octets. If the email address has an ASCII form that differs # from its internationalized form, I don't think the internationalized # form can be longer, and so the ASCII form length check would be # sufficient. If there is no ASCII form, then we have to check the UTF-8 # encoding. The UTF-8 encoding could be up to about four times longer than # the number of characters. 
if ascii_email and len(ascii_email) > _EMAIL_MAX_LENGTH: raise ValueError("email address is too long when isda encoded") else: if len(normalized_email) > _EMAIL_MAX_LENGTH: raise ValueError("email address is too long") if len(normalized_email.encode("utf8")) > _EMAIL_MAX_LENGTH: raise ValueError("email address is too long when utf-8 encoded") if not allow_unnormalized and value != normalized_email: raise ValueError("email address is not normalised")
def validate_email_domain_part(domain):
    """Validate and normalize the domain part of an email address.

    Returns a dict with the IDNA ASCII-encoded domain under ``"domain"`` and
    the decoded internationalized form under ``"domain_i18n"``. Every
    validation failure is reported as ``EmailSyntaxError``.
    """
    # Empty?
    if len(domain) == 0:
        raise EmailSyntaxError("There must be something after the @-sign.")

    # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
    # and converting all label separators (the period/full stop, fullwidth full stop,
    # ideographic full stop, and halfwidth ideographic full stop) to basic periods.
    # It will also raise an exception if there is an invalid character in the input,
    # such as "⒈" which is invalid because it would expand to include a period.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e)))

    # Now we can perform basic checks on the use of periods (since equivalent
    # symbols have been mapped to periods). These checks are needed because the
    # IDNA library doesn't handle well domains that have empty labels (i.e. initial
    # dot, trailing dot, or two dots in a row).
    if domain.endswith("."):
        raise EmailSyntaxError("An email address cannot end with a period.")
    if domain.startswith("."):
        raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.")
    if ".." in domain:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    # Regardless of whether international characters are actually used,
    # first convert to IDNA ASCII. For ASCII-only domains, the transformation
    # does nothing. If internationalized characters are present, the MTA
    # must either support SMTPUTF8 or the mail client must convert the
    # domain name to IDNA before submission.
    #
    # Unfortunately this step incorrectly 'fixes' domain names with leading
    # periods by removing them, so we have to check for this above. It also gives
    # a funky error message ("No input") when there are two periods in a
    # row, also checked separately above.
    try:
        domain = idna.encode(domain, uts46=False).decode("ascii")
    except idna.IDNAError as e:
        raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e)))

    # We may have been given an IDNA ASCII domain to begin with. Check
    # that the domain actually conforms to IDNA. It could look like IDNA
    # but not be actual IDNA. For ASCII-only domains, the conversion out
    # of IDNA just gives the same thing back.
    #
    # This gives us the canonical internationalized form of the domain,
    # which we should use in all error messages.
    try:
        domain_i18n = idna.decode(domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (domain, str(e)))

    # RFC 5321 4.5.3.1.2
    if len(domain) > 255:
        raise EmailSyntaxError("The email address is too long after the @-sign.")

    # A "dot atom text", per RFC 2822 3.2.4, but using the restricted
    # characters allowed in a hostname (see ATEXT_HOSTNAME above).
    DOT_ATOM_TEXT = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*'

    # Check the regular expression. This is probably entirely redundant
    # with idna.decode, which also checks this format.
    # Anchor with \Z rather than $: "$" also matches just before a trailing
    # newline, so it would let a domain ending in "\n" through.
    m = re.match(DOT_ATOM_TEXT + r"\Z", domain)
    if not m:
        raise EmailSyntaxError("The email address contains invalid characters after the @-sign.")

    # All publicly deliverable addresses have domain named with at least
    # one period. We also know that all TLDs end with a letter.
    if "." not in domain:
        raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n)
    # \Z (not $) so the final character itself must be a letter.
    if not re.search(r"[A-Za-z]\Z", domain):
        raise EmailSyntaxError("The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n)

    # Return the IDNA ASCII-encoded form of the domain, which is how it
    # would be transmitted on the wire (except when used with SMTPUTF8
    # possibly), as well as the canonical Unicode form of the domain,
    # which is better for display purposes. This should also take care
    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "domain": domain,
        "domain_i18n": domain_i18n,
    }
def test_std3(self):
    # With STD3 rules disabled the underscore is tolerated and only the
    # case mapping is applied.
    relaxed = idna.uts46_remap('A_', std3_rules=False)
    self.assertEqual(relaxed, 'a_')

    # With STD3 rules enabled the same input must be rejected.
    with self.assertRaises(idna.InvalidCodepoint):
        idna.uts46_remap('A_', std3_rules=True)
def to_bytes(proto, string):
    """Return ``string`` UTS-46 remapped and encoded as UTF-8 bytes.

    ``proto`` is accepted but not used by this function.
    """
    remapped = idna.uts46_remap(string)
    return remapped.encode("utf-8")