def decode_mail_header(value, default_charset='us-ascii'):
    """Decode a mail header value into a single text string.

    The value is split with ``decode_header`` into ``(text, charset)``
    fragments; each fragment is converted with the project helper
    ``str_decode`` and the results are concatenated without a separator.

    :param value: raw header value.
    :param default_charset: charset used when a fragment declares none, or
        when the declared charset triggers ``LookupError``.
    :return: the concatenated, decoded header text.  If ``decode_header``
        raises ``HeaderParseError`` the whole value is instead round-tripped
        through ``str_encode``/``str_decode`` with ``default_charset``.
    """
    try:
        fragments = decode_header(value)
    except email.errors.HeaderParseError:
        reencoded = str_encode(value, default_charset, 'replace')
        return str_decode(reencoded, default_charset)

    decoded_parts = []
    for position, (raw_text, charset) in enumerate(fragments):
        try:
            # The debug preview falls back to utf-8 (not default_charset)
            # when the fragment carries no charset of its own.
            preview = str_decode(raw_text, charset or 'utf-8', 'replace')
            logger.debug(
                "Mail header no. {index}: {data} encoding {charset}".format(
                    index=position, data=preview, charset=charset))
            piece = str_decode(raw_text, charset or default_charset, 'replace')
        except LookupError:
            # Unknown charset: force the default one.
            piece = str_decode(raw_text, default_charset, 'replace')
        decoded_parts.append(piece)

    return ''.join(decoded_parts)
def decode_mail_header(value, default_charset='us-ascii'):
    """Turn a raw mail header value into one decoded text string.

    ``decode_header`` yields ``(text, charset)`` pairs; every pair is passed
    to the project helper ``str_decode`` using the pair's charset, or
    ``default_charset`` when none is declared.  The decoded pieces are joined
    with no separator.

    :param value: raw header value.
    :param default_charset: fallback charset, also used when the declared
        charset raises ``LookupError``.
    :return: decoded header text.  When ``decode_header`` raises
        ``HeaderParseError`` the value is re-encoded and decoded with
        ``default_charset`` instead.
    """
    try:
        chunks = decode_header(value)
    except email.errors.HeaderParseError:
        return str_decode(
            str_encode(value, default_charset, 'replace'), default_charset)

    def _to_text(raw, charset):
        # Decode one fragment, forcing the default charset if the declared
        # one is unknown.
        try:
            return str_decode(raw, charset or default_charset, 'replace')
        except LookupError:
            return str_decode(raw, default_charset, 'replace')

    return ''.join(_to_text(raw, charset) for raw, charset in chunks)
def decode_param(param):
    """Split a ``name=value`` parameter and decode encoded words in the value.

    The value is processed line by line (split on ``'\\n'``).  A line that
    contains an ``=?charset?Q?payload?=`` or ``=?charset?B?payload?=`` token
    is replaced entirely by the decoded payload: ``Q`` payloads go through
    ``quopri.decodestring``, ``B`` payloads through base64, and the result is
    handed to the project helper ``str_encode`` together with the declared
    charset (presumably yielding text -- confirm against that helper).
    Lines without such a token are kept unchanged.  All lines are then
    joined without a separator.

    :param param: parameter string containing at least one ``=``.
    :return: ``(name, value)`` tuple.
    :raises ValueError: if ``param`` contains no ``=``.
    """
    name, v = param.split('=', 1)

    # ``base64.decodestring`` was removed in Python 3.9; ``decodebytes`` is
    # its replacement.  Fall back to the old name only where the new one
    # does not exist.
    b64_decode = getattr(base64, 'decodebytes', None) or base64.decodestring
    encoded_word = re.compile(r'=\?((?:\w|-)+)\?(Q|B)\?(.+)\?=')

    value_results = []
    for value in v.split('\n'):
        match = encoded_word.search(value)
        if match:
            encoding, type_, code = match.groups()
            if type_ == 'Q':
                value = quopri.decodestring(code)
            elif type_ == 'B':
                # The base64 decoder only accepts bytes-like input, while the
                # regex on a text value yields text.
                if not isinstance(code, bytes):
                    code = code.encode('ascii')
                value = b64_decode(code)
            value = str_encode(value, encoding)
        value_results.append(value)

    if value_results:
        v = ''.join(value_results)
    return name, v
def decode_param(param):
    """Split a ``name=value`` parameter and decode encoded words in the value.

    Each ``'\\n'``-separated line of the value is searched for an
    ``=?charset?Q?payload?=`` / ``=?charset?B?payload?=`` token.  When one is
    found the whole line is replaced by the decoded payload (quoted-printable
    or base64), passed through the project helper ``str_encode`` with the
    declared charset.  Other lines are kept verbatim.  The lines are joined
    without a separator and the result is logged at debug level.

    :param param: parameter string containing at least one ``=``.
    :return: ``(name, value)`` tuple.
    :raises ValueError: if ``param`` contains no ``=``.
    """
    name, v = param.split('=', 1)

    pieces = []
    for line in v.split('\n'):
        found = re.search(r'=\?((?:\w|-)+)\?([QB])\?(.+)\?=', line)
        if found is None:
            pieces.append(line)
            continue

        encoding, kind, payload = found.groups()
        decoded = line
        if kind == 'Q':
            decoded = quopri.decodestring(payload)
        elif kind == 'B':
            decoded = base64.decodebytes(payload.encode())
        pieces.append(str_encode(decoded, encoding))

    if pieces:
        v = ''.join(pieces)

    logger.debug("Decoded parameter {} - {}".format(name, v))
    return name, v
def decode_param(param):
    """Split a ``name=value`` parameter and decode encoded words in the value.

    The value is handled line by line (split on ``'\\n'``).  A line holding
    an ``=?charset?Q?payload?=`` or ``=?charset?B?payload?=`` token is
    replaced entirely by the decoded payload (``quopri.decodestring`` for
    ``Q``, ``base64.decodebytes`` for ``B``), which is then passed to the
    project helper ``str_encode`` with the declared charset.  Lines without
    a token are kept unchanged; all lines are joined without a separator.

    :param param: parameter string containing at least one ``=``.
    :return: ``(name, value)`` tuple.
    :raises ValueError: if ``param`` contains no ``=``.
    """
    name, v = param.split('=', 1)
    encoded_word = re.compile(r'=\?((?:\w|-)+)\?(Q|B)\?(.+)\?=')

    value_results = []
    for value in v.split('\n'):
        match = encoded_word.search(value)
        if match:
            encoding, type_, code = match.groups()
            if isinstance(code, str):
                try:
                    code = code.encode('ascii')
                except UnicodeError:
                    # Deliberate best effort: a payload that is not pure
                    # ASCII is handed to the decoder unchanged.  Only the
                    # encoding failure is tolerated here; any other error
                    # is a real bug and must surface.
                    pass
            if type_ == 'Q':
                value = quopri.decodestring(code)
            elif type_ == 'B':
                value = base64.decodebytes(code)
            value = str_encode(value, encoding)
        value_results.append(value)

    if value_results:
        v = ''.join(value_results)
    return name, v
def decode_mail_header(value, default_charset='us-ascii'):
    """Decode a header value into a unicode string.

    ``decode_header`` splits the value into ``(text, charset)`` fragments.
    Each fragment is decoded with the project helper ``str_decode`` using its
    declared charset (or ``default_charset`` when none is declared) and the
    decoded fragments are concatenated without a separator.

    :param value: raw header value.
    :param default_charset: charset used when a fragment declares none, or
        when the declared charset is unknown (``LookupError``).
    :return: the decoded header text.  If ``decode_header`` raises
        ``HeaderParseError`` the value is instead re-encoded and decoded
        with ``default_charset``.
    """
    try:
        headers = decode_header(value)
    except email.errors.HeaderParseError:
        return str_decode(
            str_encode(value, default_charset, 'replace'), default_charset)

    for index, (text, charset) in enumerate(headers):
        try:
            # The debug preview decodes with the declared charset as well, so
            # it has to live inside the ``try``: otherwise an unknown charset
            # raises LookupError from the log call and the fallback below is
            # never reached.
            logger.debug(
                "Mail header no. {index}: {data} encoding {charset}".format(
                    index=index,
                    data=str_decode(text, charset or 'utf-8', 'replace'),
                    charset=charset))
            headers[index] = str_decode(
                text, charset or default_charset, 'replace')
        except LookupError:
            # if the charset is unknown, force default
            headers[index] = str_decode(text, default_charset, 'replace')
    return ''.join(headers)
def decode_param(param):
    """Split a ``name=value`` parameter and decode encoded words in the value.

    Each ``'\\n'``-separated line of the value is searched for an
    ``=?charset?Q?payload?=`` / ``=?charset?B?payload?=`` token.  A matching
    line is replaced entirely by the decoded payload, passed through the
    project helper ``str_encode`` with the declared charset and the
    ``'ignore'`` error mode.  Non-matching lines are kept verbatim.  The
    lines are joined without a separator and the result is logged at debug
    level.

    :param param: parameter string containing at least one ``=``.
    :return: ``(name, value)`` tuple.
    :raises ValueError: if ``param`` contains no ``=``.
    """
    name, v = param.split('=', 1)

    pieces = []
    for line in v.split('\n'):
        found = re.search(r'=\?((?:\w|-)+)\?([QB])\?(.+)\?=', line)
        if found is None:
            pieces.append(line)
            continue

        encoding, kind, payload = found.groups()
        decoded = line
        if kind == 'Q':
            # Swap non-breaking spaces (code point 160) for ordinary spaces
            # (code point 32) before quoted-printable decoding.
            payload = payload.replace(chr(160), ' ')
            decoded = quopri.decodestring(payload)
        elif kind == 'B':
            decoded = base64.decodebytes(payload.encode())

        # 'ignore' is passed as the error mode; per the original author's
        # note this was introduced to stop a UnicodeError.
        pieces.append(str_encode(decoded, encoding, 'ignore'))

    if pieces:
        v = ''.join(pieces)

    logger.debug("Decoded parameter {} - {}".format(name, v))
    return name, v
def parse_email(raw_email):
    """Parse a raw e-mail into a ``Struct`` of body, attachments and headers.

    Binary input is first converted with the project helper ``str_encode``
    (utf-8).  The message is parsed with ``email.message_from_string`` and,
    depending on its top-level main type:

    * ``multipart`` / ``image``: every part from ``walk()`` is inspected.
      ``text/plain`` and ``text/html`` parts whose ``Content-Disposition`` is
      absent or exactly ``"inline"`` are collected into the body; any other
      part that has a ``Content-Disposition`` is handed to
      ``parse_attachment`` and kept if the result is truthy.
    * ``text``: the decoded payload becomes the single plain body entry.

    The result also carries the raw message, the address lists returned by
    ``get_mail_addresses`` for from/to/cc/bcc, the decoded ``subject``,
    ``date`` and ``message_id`` headers, a list of selected raw headers and,
    when a date header is present, ``parsed_date`` (a ``datetime`` or
    ``None`` if the date could not be parsed).

    :param raw_email: the raw message, text or binary.
    :return: ``Struct`` built from the collected fields.
    """
    if isinstance(raw_email, binary_type):
        raw_email = str_encode(raw_email, 'utf-8')

    try:
        message = email.message_from_string(raw_email)
    except UnicodeEncodeError:
        message = email.message_from_string(raw_email.encode('utf-8'))

    top_maintype = message.get_content_maintype()
    parsed_email = {'raw_email': raw_email}

    plain_parts = []
    html_parts = []
    attachments = []

    if top_maintype in ('multipart', 'image'):
        logger.debug("Multipart message. Will process parts.")
        for part in message.walk():
            content_type = part.get_content_type()
            part_maintype = part.get_content_maintype()
            disposition = part.get('Content-Disposition', None)

            # Parts with a disposition, and all non-text parts, are taken as
            # decoded payload; bare text parts go through decode_content.
            if disposition or part_maintype != "text":
                content = part.get_payload(decode=True)
            else:
                content = decode_content(part)

            is_inline = disposition is None or disposition == "inline"

            if is_inline and content_type == "text/plain":
                plain_parts.append(content)
            elif is_inline and content_type == "text/html":
                html_parts.append(content)
            elif disposition:
                attachment = parse_attachment(part)
                if attachment:
                    attachments.append(attachment)
    elif top_maintype == 'text':
        plain_parts.append(decode_content(message))

    parsed_email['attachments'] = attachments
    parsed_email['body'] = {"plain": plain_parts, "html": html_parts}

    email_dict = dict(message.items())

    for field, header_name in (('sent_from', 'from'),
                               ('sent_to', 'to'),
                               ('cc', 'cc'),
                               ('bcc', 'bcc')):
        parsed_email[field] = get_mail_addresses(message, header_name)

    # Headers stored as top-level fields (decoded) ...
    value_headers_keys = ['subject', 'date', 'message-id']
    # ... and headers copied verbatim into the 'headers' list.
    key_value_header_keys = ['received-spf',
                             'mime-version',
                             'x-spam-status',
                             'x-spam-score',
                             'content-type']

    parsed_email['headers'] = []
    for key, value in email_dict.items():
        lowered = key.lower()
        if lowered in value_headers_keys:
            parsed_email[lowered.replace('-', '_')] = decode_mail_header(value)
        if lowered in key_value_header_keys:
            parsed_email['headers'].append({'Name': key, 'Value': value})

    if parsed_email.get('date'):
        timetuple = email.utils.parsedate(parsed_email['date'])
        if timetuple:
            parsed_date = datetime.fromtimestamp(time.mktime(timetuple))
        else:
            parsed_date = None
        parsed_email['parsed_date'] = parsed_date

    logger.info("Downloaded and parsed mail '{}' with {} attachments".format(
        parsed_email.get('subject'), len(parsed_email.get('attachments'))))
    return Struct(**parsed_email)
def parse_email(uid, raw_email, policy=None):
    """Parse a raw e-mail into a ``Struct`` of body, attachments and headers.

    Binary input is first converted with the project helper ``str_encode``
    (utf-8, errors ignored).  The message is parsed with
    ``email.message_from_string`` and, depending on its top-level main type:

    * ``multipart`` / ``image``: every part from ``walk()`` is inspected.
      Inline ``text/plain`` / ``text/html`` parts are appended to the body
      with all ``\\r`` and ``\\n`` removed; inline ``image/jpeg`` parts that
      carry a ``name`` parameter are written below ``illustrate_path``; any
      other part that has a ``Content-Disposition`` is handed to
      ``parse_attachment`` and kept if the result is truthy.  A part counts
      as inline when it has no ``Content-Disposition`` or one starting with
      ``"inline"``.
    * ``text``: the decoded payload becomes the single plain body entry.

    The result also carries the address lists returned by
    ``get_mail_addresses`` for from/to/cc/bcc, the decoded ``subject``,
    ``date`` and ``message_id`` headers, a list of selected raw headers and,
    when a date header is present, ``parsed_date`` formatted as
    ``"%Y-%m-%d %H:%M:%S"`` (or ``None`` when the date cannot be parsed).

    :param uid: message uid as bytes; stored decoded under ``'uid'``.
    :param raw_email: the raw message, text or bytes.
    :param policy: optional policy forwarded to
        ``email.message_from_string``.
    :return: ``Struct`` built from the collected fields.
    """
    parsed_email = {'uid': bytes.decode(uid)}

    if isinstance(raw_email, bytes):
        raw_email = str_encode(raw_email, 'utf-8', errors='ignore')

    email_parse_kwargs = {} if policy is None else {'policy': policy}
    try:
        email_message = email.message_from_string(
            raw_email, **email_parse_kwargs)
    except UnicodeEncodeError:
        email_message = email.message_from_string(
            raw_email.encode('utf-8'), **email_parse_kwargs)

    maintype = email_message.get_content_maintype()
    body = {"plain": [], "html": []}
    attachments = []

    if maintype in ('multipart', 'image'):
        for part in email_message.walk():
            content_type = part.get_content_type()
            part_maintype = part.get_content_maintype()
            content_disposition = part.get('Content-Disposition', None)

            if content_disposition or not part_maintype == "text":
                content = part.get_payload(decode=True)
            else:
                content = decode_content(part)

            is_inline = content_disposition is None \
                or content_disposition.startswith("inline")

            if content_type == "text/plain" and is_inline:
                body['plain'].append(re.sub('\\r|\\n', '', str(content)))
            elif content_type == "text/html" and is_inline:
                body['html'].append(re.sub('\\r|\\n', '', str(content)))
            elif content_type == "image/jpeg" and is_inline:
                # Illustration embedded in the message body.
                filename = decode_mail_header(str(part.get_param('name')))
                # str(None) == 'None' marks a part without a name parameter.
                if not op.eq(filename, 'None'):
                    # The name comes from the message itself, so keep only
                    # its final path component: it must not be able to
                    # escape illustrate_path.
                    filename = os.path.basename(filename)
                    # Drop the last four characters (presumably a ".jpg"
                    # suffix -- confirm against the consumers of these
                    # files).
                    filename = filename[:-4]
                    # NOTE(review): parts with no usable name are skipped
                    # rather than written under a placeholder name.
                    if filename:
                        if not os.path.exists(illustrate_path):
                            os.makedirs(illustrate_path)
                        target = os.path.join(illustrate_path, filename)
                        # The with-block closes the file; no explicit close.
                        with open(target, "wb") as fw:
                            fw.write(content)
            elif content_disposition:
                # Regular attachment.
                attachment = parse_attachment(part)
                if attachment:
                    attachments.append(attachment)
    elif maintype == 'text':
        payload = decode_content(email_message)
        body['plain'].append(payload)

    parsed_email['attachments'] = attachments
    parsed_email['body'] = body

    email_dict = dict(email_message.items())

    parsed_email['sent_from'] = get_mail_addresses(email_message, 'from')
    parsed_email['sent_to'] = get_mail_addresses(email_message, 'to')
    parsed_email['cc'] = get_mail_addresses(email_message, 'cc')
    parsed_email['bcc'] = get_mail_addresses(email_message, 'bcc')

    # Headers stored as top-level fields (decoded) ...
    value_headers_keys = ['subject', 'date', 'message-id']
    # ... and headers copied verbatim into the 'headers' list.
    key_value_header_keys = [
        'received-spf', 'mime-version', 'x-spam-status', 'x-spam-score',
        'content-type'
    ]

    parsed_email['headers'] = []
    for key, value in email_dict.items():
        if key.lower() in value_headers_keys:
            valid_key_name = key.lower().replace('-', '_')
            parsed_email[valid_key_name] = decode_mail_header(value)
        if key.lower() in key_value_header_keys:
            parsed_email['headers'].append({'Name': key, 'Value': value})

    if parsed_email.get('date'):
        timetuple = email.utils.parsedate(parsed_email['date'])
        if timetuple:
            parsed_date = datetime.fromtimestamp(time.mktime(timetuple))
            parsed_email['parsed_date'] = parsed_date.strftime(
                "%Y-%m-%d %H:%M:%S")
        else:
            # parsedate() returns None for an unparseable date; calling
            # strftime on it would raise AttributeError.
            parsed_email['parsed_date'] = None

    return Struct(**parsed_email)