def safe_decode_hdr(msg=None, name=None, hdr=None, charset=None):
    """
    This method stubbornly tries to decode header data and convert
    to Pythonic unicode strings. The strings are guaranteed not to
    contain tab, newline or carriage return characters.

    If used with a message object, the header and the MIME charset
    will be inferred from the message headers.

    >>> msg = email.message.Message()
    >>> msg['content-type'] = 'text/plain; charset=utf-8'
    >>> msg['from'] = 'G\\xc3\\xadsli R \\xc3\\x93la <*****@*****.**>'
    >>> safe_decode_hdr(msg, 'from')
    u'G\\xedsli R \\xd3la <*****@*****.**>'

    The =?...?= MIME header encoding is also recognized and processed.

    >>> safe_decode_hdr(hdr='=?iso-8859-1?Q?G=EDsli_R_=D3la?=\\r\\n<*****@*****.**>')
    u'G\\xedsli R \\xd3la <*****@*****.**>'

    >>> safe_decode_hdr(hdr='"=?utf-8?Q?G=EDsli_R?= =?iso-8859-1?Q?=D3la?="')
    u'G\\xedsli R \\xd3la'

    And finally, guesses are made with raw binary data. This process
    could be improved, it currently only attempts utf-8 and iso-8859-1.

    >>> safe_decode_hdr(hdr='"G\\xedsli R \\xd3la"\\r\\t<*****@*****.**>')
    u'"G\\xedsli R \\xd3la" <*****@*****.**>'

    >>> safe_decode_hdr(hdr='"G\\xc3\\xadsli R \\xc3\\x93la"\\n <*****@*****.**>')
    u'"G\\xedsli R \\xd3la" <*****@*****.**>'

    # See https://bugs.python.org/issue1079

    # encoded word enclosed in parenthesis (comment syntax)
    >>> safe_decode_hdr(hdr='[email protected] (=?utf-8?Q?Ren=C3=A9?=)')
    u'[email protected] ( Ren\\xe9 )'

    # no space after encoded word
    >>> safe_decode_hdr(hdr='=?UTF-8?Q?Direction?=<*****@*****.**>')
    u'Direction <*****@*****.**>'

    Arguments:
        msg: optional message object; when `hdr` is None the value is
            read with `msg[name]` and the charset falls back to
            `msg.get_content_charset()`.
        name: header name to look up in `msg`.
        hdr: raw header value; when given, `msg` and `name` are ignored.
        charset: preferred charset; defaults to 'utf-8' when neither
            this argument nor the message provides one.

    Returns the decoded header with CR, LF and TAB replaced by spaces.
    When neither `hdr` nor `msg` is supplied, the result is empty.
    """
    if hdr is None:
        # `msg` defaults to None, so guard both lookups: a missing message
        # (or a missing header) yields an empty value and the utf-8
        # fallback instead of an AttributeError on None.
        value = (msg[name] if msg is not None else None) or ''
        if not charset and msg is not None:
            charset = msg.get_content_charset()
        charset = charset or 'utf-8'
    else:
        value = hdr
        charset = charset or 'utf-8'

    if not isinstance(value, unicode):
        # Still a byte str, which might be nasty binary data; decode it
        # leniently, substituting '?' for anything undecodable.
        value = try_decode(value, charset, replace='?')

    # At this point we know we have a unicode string. Next we try
    # to very stubbornly decode and discover character sets.
    if '=?' in value and '?=' in value:
        try:
            # decode_header wants an unquoted str (not unicode)
            value = value.encode('utf-8').replace('"', '')
            # Decode! Each pair is (raw text, charset-or-None); chunks
            # without an explicit charset use the fallback charset.
            pairs = decode_header(value)
            value = ' '.join([try_decode(t, cs or charset)
                              for t, cs in pairs])
        except email.errors.HeaderParseError:
            # Malformed encoded-word: deliberately keep the value as it
            # stood when parsing failed (best-effort decoding).
            pass

    # Finally, return the unicode data, with white-space normalized
    return value.replace('\r', ' ').replace('\t', ' ').replace('\n', ' ')
def test_decode_header_no_encoding(self):
    """A plain header with no encoded-words decodes to one (text, None) pair."""
    expected_pairs = [('olmsted', None)]
    self.assertEqual(decode_header("olmsted"), expected_pairs)