def decodeMessageAsString(msg):
    """This helper method takes Message object or string and returns string
    which does not contain base64 encoded parts.

    Returns message without any encoding in parts.
    """
    # Accept either a raw RFC 2822 string or an email.Message object.
    if isinstance(msg, str):
        msg = Parser().parsestr(msg)
    # Work on a copy so the caller's message is left untouched.
    new = deepcopy(msg)
    # From is utf8 encoded: '=?utf-8?q?Site_Administrator_=3C=3E?='
    # Only the first fragment of each header is kept.
    new.replace_header('From', decode_header(new['From'])[0][0])
    new.replace_header('Subject', decode_header(new['Subject'])[0][0])
    # Re-encode every leaf part as quoted-printable utf-8 instead of base64.
    charset = Charset('utf-8')
    charset.header_encoding = SHORTEST
    charset.body_encoding = QP
    charset.output_charset = 'utf-8'
    for part in new.walk():
        # Containers have no payload of their own.
        if part.get_content_maintype() == "multipart":
            continue
        decoded = part.get_payload(decode=1)
        # Drop the old CTE header; set_payload with the charset re-adds it.
        del part['Content-Transfer-Encoding']
        part.set_payload(decoded, charset)
    return new.as_string()
def parseRequest(msg):
    """Parse a request message into (target, fromlist, cclist).

    Returns None if mandatory headers are missing or the subject does not
    match the module-level pattern `p`.  Python 2 only (`unicode`, `has_key`).
    """
    # check mandatory header fields
    if not msg.has_key("Subject"):
        return None
    if not msg.has_key("From"):
        return None
    # parse the first subject header only
    s, enc = decode_header(msg.get_all("Subject")[0])[0]
    if not enc:
        s = unicode(s)
    else:
        s = unicode(s, enc)
    # `p` is a module-level compiled regex with named groups -- TODO confirm
    m = p.match(s)
    if not m:
        return None
    # normalize request
    gd = m.groupdict()
    delivery_type = gd["delivery_type"].upper()
    if delivery_type == "CANDC":
        delivery_type = "CandC"
    # locale normalized as xx_YY (language lower, region upper)
    locale = gd["locale"].lower()
    locale = locale[:3] + locale[3:].upper()
    if gd.has_key("audio") and gd["audio"]:
        target = delivery_type + "." + locale + "+audio"
    else:
        target = delivery_type + "." + locale
    # normalize "From" entries
    fromlist = []
    for entry in msg.get_all("From", []):
        uentry = u""
        for partition, enc in decode_header(entry):
            if not enc:
                uentry += unicode(partition)
            else:
                uentry += unicode(partition, enc)
        fromlist.append(uentry)
    # normalize "Cc" entries
    cclist = []
    for entry in msg.get_all("Cc", []):
        uentry = u""
        for partition, enc in decode_header(entry):
            if not enc:
                uentry += unicode(partition)
            else:
                uentry += unicode(partition, enc)
        cclist.append(uentry)
    return (target, fromlist, cclist)
def decode_subject(subject):
    """Return the subject as unicode, RFC 2047-decoding it when it is an
    encoded word; otherwise collapse any RFC 2231 value."""
    if subject.startswith('=?') and subject.endswith('?='):
        fragments = []
        for text, charset in decode_header(subject):
            fragments.append(unicode(text, charset or 'us-ascii'))
        return u''.join(fragments)
    return unicode(collapse_rfc2231_value(subject))
def handle_attachment(message, content, related=False):
    """Persist one MIME part `content` of `message` as an Attachment record
    and write its payload under files_dir/<message.id>/.

    `related` marks multipart/related (inline) parts.
    """
    # r = ''
    # if related:
    #     r = '(r)'
    # Only the first fragment of the filename header is decoded.
    filename, encoding = decode_header(content.get_filename())[0]
    if encoding:
        filename = filename.decode(encoding, errors='replace')
    #if not related:
    #    print "saving attachment [%s] of type %s from message %d %s" % (filename, content.get_content_type(), message.id, r)
    a = Attachment()
    a.filename = filename
    # TODO need to parse weird strings from this
    if not a.filename:
        # No usable filename: fall back to a random unique name.
        a.filename = str(uuid.uuid4())
    a.content_type = content.get_content_type()
    a.stored_location = os.path.join(files_dir, str(message.id), get_valid_filename(a.filename))
    # probably want to fix this too
    a.mime_related = related
    # load the file
    file_content = content.get_payload(decode=1)
    # again, probably a better way to do this than all in memory
    a.file_md5 = hashlib.md5(file_content).hexdigest()
    # actually write it do disk - should wrap this in a try except too
    if not os.path.exists(os.path.join(files_dir, str(message.id))):
        os.makedirs(os.path.join(files_dir, str(message.id)))
    with open(a.stored_location, 'wb') as fp:
        fp.write(file_content)
    a.message = message
    a.save()
def test_japanese_codecs(self):
    """Exercise RFC 2047 encoding/decoding of euc-jp and iso-8859-1 chunks."""
    eq = self.ndiffAssertEqual
    j = Charset("euc-jp")
    g = Charset("iso-8859-1")
    h = Header("Hello World!")
    # euc-jp bytes for a katakana greeting
    jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
    ghello = 'Gr\xfc\xdf Gott!'
    h.append(jhello, j)
    h.append(ghello, g)
    # BAW: This used to -- and maybe should -- fold the two iso-8859-1
    # chunks into a single encoded word.  However it doesn't violate the
    # standard to have them as two encoded chunks and maybe it's
    # reasonable <wink> for each .append() call to result in a separate
    # encoded word.
    # NOTE(review): line breaks inside these expected strings were
    # reconstructed; confirm against the original test layout.
    eq(h.encode(), """\
Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
 =?iso-8859-1?q?Gr=FC=DF?= =?iso-8859-1?q?_Gott!?=""")
    eq(decode_header(h.encode()),
       [('Hello World!', None),
        ('\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
        ('Gr\xfc\xdf Gott!', 'iso-8859-1')])
    long = 'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5\xa4\xec\xa4\xbf\xa5\xe1\xa1\xbc\xa5\xeb\xa4\xcf\xbb\xca\xb2\xf1\xbc\xd4\xa4\xce\xbe\xb5\xc7\xa7\xa4\xf2\xc2\xd4\xa4\xc3\xa4\xc6\xa4\xa4\xa4\xde\xa4\xb9'
    h = Header(long, j, header_name="Subject")
    # test a very long header
    enc = h.encode()
    # TK: splitting point may differ by codec design and/or Header encoding
    eq(enc, """\
=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYSE8JWskTztKGyhC?=
 =?iso-2022-jp?b?GyRCMnE8VCROPjVHJyRyQlQkQyRGJCQkXiQ5GyhC?=""")
    # TK: full decode comparison
    eq(h.__unicode__().encode('euc-jp'), long)
def process_incoming_mail(self, msg):
    """Extract addressing, decoded subject and text/plain body from `msg`.

    NOTE(review): the visible code ends after collecting `content`; the
    function appears truncated here -- confirm against the full file.
    """
    to = self.get_email_address_ident(msg, 'To')
    sender = msg.get('From')
    reply_to = self.get_email_address_ident(msg, 'In-Reply-To')
    title = msg.get('Subject')
    if title:
        # RFC 2047-decode every fragment of the subject.
        new_title = u''
        for part in decode_header(title):
            if part[1]:
                new_title += unicode(part[0], part[1])
            else:
                new_title += unicode(part[0])
        title = new_title
    content = u''
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            s = part.get_payload(decode=True)
            # Try the part's charsets first, then the whole message's.
            charsets = part.get_charsets() + msg.get_charsets()
            for charset in charsets:
                try:
                    if charset is not None:
                        content += unicode(s, charset)
                    else:
                        content += unicode(s)
                except UnicodeError, e:
                    # Wrong charset guess: try the next candidate.
                    self.logger.warning('Unicode error: %s' % e)
                    continue
                except Exception, e:
                    self.logger.exception(e)
                    continue
                else:
                    # First charset that decodes cleanly wins.
                    break
def getDecodedHeaders(msg, cset='utf-8'):
    """Returns a unicode containing all the headers of msg, unfolded and
    RFC 2047 decoded, normalized and separated by new lines.
    """
    headers = u''
    for h, v in msg.items():
        uvalue = u''
        try:
            # Unfold continuation lines before RFC 2047 decoding.
            v = decode_header(re.sub(r'\n\s', ' ', v))
        except HeaderParseError:
            v = [(v, 'us-ascii')]
        for frag, cs in v:
            if not cs:
                cs = 'us-ascii'
            try:
                uvalue += unicode(frag, cs, 'replace')
            except LookupError:
                # The encoding charset is unknown.  At this point, frag
                # has been QP or base64 decoded into a byte string whose
                # charset we don't know how to handle.  We will try to
                # unicode it as iso-8859-1 which may result in a garbled
                # mess, but we have to do something.
                uvalue += unicode(frag, 'iso-8859-1', 'replace')
        # Fix: uhdr was computed but the raw byte string h was interpolated
        # into the unicode result; use the decoded header name instead.
        uhdr = h.decode('us-ascii', 'replace')
        headers += u'%s: %s\n' % (uhdr,
                                  normalize(mm_cfg.NORMALIZE_FORM, uvalue))
    return headers
def add_sender(self, message):
    """Register the sender of `message` in the user database, with a lazy
    geolocation callback based on the earliest public Received IP."""
    def email_location():
        # Best-effort geolocation; returns ((lat, lon), 0) or None.
        recieved = message.get_all('Original-Received')
        ips = [IP.findall(h) for h in recieved]
        # Drop private-range addresses; keep the first IP of each header.
        ips = [ip[0] for ip in ips
               if ip and not ip[0].startswith("10.")
               and not ip[0].startswith("192.168")]
        likely = ips[-1]
        try:
            logger.info("geocoder: Getting location for %s" % (likely))
            url = "http://freegeoip.net/json/%s" % likely
            logger.debug("geocoder: Fetching %s" % (url))
            loc = json.loads(urllib2.urlopen(url).read())
            ll = float(loc['latitude']), float(loc['longitude'])
            if any(ll):
                return ll, 0
        except:
            # Deliberate best-effort: any network/parse failure yields None.
            pass
    users = getUtility(IUserDatabase)
    from_ = list(email.utils.parseaddr(message.get("From")))
    # Remove quoted printable
    from_[0] = decode_header(from_[0])[0]
    encoding = from_[0][1]
    if encoding is None:
        encoding = "utf-8"
    from_[0] = from_[0][0].decode(encoding)
    users.add_user(User(from_[0], from_[1], location_func=email_location))
def _get_header(str): '''Get the full text of a header and remove newlines.''' list = decode_header(str) retString = '' for string, charset in list: retString += string.replace("\n", '').replace("\r", '') return retString
def get_header_content(name='', str_encoded=''):
    """Return the decoded, whitespace-collapsed value of header `name` from
    the message file given as the first CLI argument.

    NOTE(review): the file handle from open() is never closed, and encoded
    fragments are concatenated without decoding their charsets -- confirm
    whether that is intended.
    """
    message = Parser().parse(open(sys.argv[1]))
    # NEWLINE is a module-level compiled regex -- TODO confirm
    for line in NEWLINE.split(message.get(name)):
        decoded_headers = decode_header(line)
        for parts in decoded_headers:
            # parts is (decoded_string, charset); charset is ignored here.
            str_encoded = str_encoded + parts[0]
    # Collapse runs of whitespace/newlines into single spaces.
    return re.sub('(\r\n|\s|\t){2,}', ' ', str_encoded)
def parse_header(val):
    """Decode headers gratuitously encoded to hide the content.

    Returns the decoded header re-encoded in the first of us-ascii,
    iso-8859-1 or utf8 that fits; on any decoding problem the original
    value is returned unchanged.
    """
    try:
        h = decode_header(val)
        # Nothing to do for empty or single unencoded fragments.
        if not len(h) or (not h[0][1] and len(h) == 1):
            return val
        u = []
        for s, enc in h:
            if enc:
                try:
                    u.append(unicode(s, enc, 'replace'))
                except LookupError:
                    # Unknown charset: fall back to the default codec.
                    u.append(unicode(s))
            else:
                u.append(unicode(s))
        u = ''.join(u)
        # Re-encode in the narrowest charset that can represent the text.
        for enc in ('us-ascii', 'iso-8859-1', 'utf8'):
            try:
                return u.encode(enc)
            except UnicodeError:
                continue
    except UnicodeDecodeError:
        pass
    except LookupError:
        pass
    except ValueError:
        pass
    except email.Errors.HeaderParseError:
        pass
    return val
def __decode_header(header): """Decode a qp-encoded e-mail header as per rfc2047""" try: words_enc = decode_header(header) hobj = make_header(words_enc) except Exception, ex: raise CmdException, "header decoding error: %s" % str(ex)
def mailread(src): """生メールから件名,本文,添付ファイル(画像)を取り出す """ # Messageオブジェクトを作る m = email.message_from_string(src) # ヘッダをデコード subj = decode_header(m["Subject"]) # ヘッダを表示 try: print unicode(make_header(subj)) except: pass print "-" * 70 # 全パートをスキャン for part in m.walk(): type = part.get_content_maintype() # maintypeを得る if type and type.find("image") != -1: # 画像の添付が見つかったら,ファイルに保存 filename = part.get_filename("notitle.img") f = open(filename, "wb") f.write(part.get_payload(decode=True)) f.close() if type and type.find("text") != -1: # テキストは表示 enc = part.get_charsets()[0] or "us-ascii" print part.get_payload().decode(enc, "ignore")
def main(args):
    """CLI entry point: extract ham/spam messages whose X-Spambayes-Evidence
    matches the requested features.  Returns a shell-style exit code."""
    try:
        opts, args = getopt.getopt(args, "hd:S:H:f:",
                                   ["help", "database=", "spamfile=",
                                    "hamfile=", "feature="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    # Features given on the command line are decoded with the locale charset.
    charset = locale.getdefaultlocale()[1]
    if not charset:
        charset = 'us-ascii'
    mapfile = spamfile = hamfile = None
    features = set()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-H", "--hamfile"):
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            features.add(str(arg, charset))
    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1
    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1
    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1
    if not features:
        # No explicit features: harvest them from the evidence headers of
        # the given mailboxes.
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                # Strip the first two summary entries; keep clue names only.
                l = [e.rsplit(": ", 1)[0] for e in evidence.split("; ")[2:]]
                for s in l:
                    try:
                        s = make_header(decode_header(s)).__unicode__()
                    except:
                        s = str(s, 'us-ascii', 'replace')
                    features.add(s)
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1
    if spamfile is not None:
        spamfile = file(spamfile, "w")
    if hamfile is not None:
        hamfile = file(hamfile, "w")
    extractmessages(features, mapd, hamfile, spamfile)
def parse_header_field(self, field):
    """RFC 2047-decode a header value into a single unicode line.

    Falls back through latin1 and lossy decoding so it never raises.
    """
    if field is None:
        return None
    # preprocess head field
    # see http://stackoverflow.com/questions/7331351/python-email-header-decoding-utf-8
    # Ensure encoded words are space-separated so decode_header splits them.
    field = re.sub(r"(=\?.*\?=)(?!$)", r"\1 ", field)
    decodefrag = decode_header(field)
    fragments = []
    for s, enc in decodefrag:
        if enc:
            try:
                s = unicode(s, enc, errors='replace')
            except UnicodeDecodeError:
                # desperate move here
                try:
                    s = s.decode("latin1")
                except:
                    pass
        else:
            try:
                s = s.decode("latin1")
            except:
                s = unicode(s, errors='ignore')
        fragments.append(s)
    field = u' '.join(fragments)
    # Unfold and strip all line breaks.
    return field.replace('\n\t', " ").replace('\n', '').replace('\r', '')
def add_sender(self, message):
    """Add the message's From address to the user database with a lazy
    geolocation callback (duplicate of the other add_sender in this file)."""
    def email_location():
        # Best-effort: geolocate the last non-private Received IP.
        recieved = message.get_all('Original-Received')
        ips = [IP.findall(h) for h in recieved]
        ips = [ip[0] for ip in ips
               if ip and not ip[0].startswith("10.")
               and not ip[0].startswith("192.168")]
        likely = ips[-1]
        try:
            logger.info("geocoder: Getting location for %s" % (likely))
            url = "http://freegeoip.net/json/%s" % likely
            logger.debug("geocoder: Fetching %s" % (url))
            loc = json.loads(urllib2.urlopen(url).read())
            ll = float(loc['latitude']), float(loc['longitude'])
            if any(ll):
                return ll, 0
        except:
            # Intentional best-effort swallow; location is optional.
            pass
    users = getUtility(IUserDatabase)
    from_ = list(email.utils.parseaddr(message.get("From")))
    # Remove quoted printable
    from_[0] = decode_header(from_[0])[0]
    encoding = from_[0][1]
    if encoding is None:
        encoding = "utf-8"
    from_[0] = from_[0][0].decode(encoding)
    users.add_user(User(from_[0], from_[1], location_func=email_location))
def mailread(src): """生メールから件名、本文、添付ファイル(画像)を取り出す """ # Messageオブジェクトを作る m = email.message_from_string(src) # ヘッダをデコード subj = decode_header(m["Subject"]) # ヘッダを表示 try: print unicode(make_header(subj)) except: pass; print "-" * 70 # 全パートをスキャン for part in m.walk(): type = part.get_content_maintype() # maintypeを得る if type and type.find("image") != -1: # 画像の添付が見つかったら、ファイルに保存 filename = part.get_filename("notitle.img") f = open(filename, "wb") f.write(part.get_payload(decode = True)) f.close() elif type and type.find("text") != -1: # テキストは表示 enc ~ part.get_charsets()[0] or "us-ascii" print part.get_payload().decode(enc, "ignore")
def finishHeader(self):
    """Finish accumulating the current header: RFC 2047-decode its value,
    optionally split multi-valued headers, and reset the accumulator."""
    if self.prevheader is not None:
        prevheader = self.prevheader.lower()
        decodedValueList = []
        try:
            parts = decode_header(self.prevvalue)
            for maybeUncoded in parts:
                if isinstance(maybeUncoded, unicode):
                    # Already unicode; nothing to decode.
                    decodedValueList.append(maybeUncoded)
                else:
                    uncoded, encoding = maybeUncoded
                    if encoding is None:
                        encoding = 'ascii'
                    decodedValueList.append(_safelyDecode(uncoded, encoding))
        except ValueError:
            # XXX where is this ValueError coming from?
            # -glyph
            decodedValue = self.prevvalue.decode('ascii', 'replace')
        else:
            decodedValue = u''.join(decodedValueList)
        if prevheader in self._normalizeHeaders:
            # Split e.g. address lists on the configured separator.
            values = decodedValue.split(self._normalizeHeaders[prevheader])
            for v in values:
                self.part.addHeader(prevheader, v)
        else:
            self.part.addHeader(self.prevheader, decodedValue)
    # NOTE(review): reset assumed unconditional -- confirm original indent.
    self.prevheader = self.prevvalue = None
def getContentInformation(self):
    """Returns the content information from the header information.
    This is used by the metadata discovery system.

    Header information is converted in UTF-8 since this is the standard
    way of representing strings in ERP5.

    NOTE(review): as visible here the decoded `text` is never stored in
    `result` and nothing is returned -- the function looks truncated;
    confirm against the full file.
    """
    result = {}
    for (name, value) in self._getMessage().items():
        try:
            decoded_header = decode_header(value)
        except HeaderParseError, error_message:
            decoded_header = ()
            LOG('EmailDocument.getContentInformation', INFO,
                'Failed to decode %s header of %s with error: %s' %
                (name, self.getPath(), error_message))
        for text, encoding in decoded_header:
            try:
                if encoding is not None:
                    text = text.decode(encoding).encode('utf-8')
                else:
                    text = text.decode().encode('utf-8')
            except (UnicodeDecodeError, LookupError), error_message:
                # Declared charset failed; guess one from the bytes.
                encoding = guessEncodingFromText(text, content_type='text/plain')
                if encoding is not None:
                    try:
                        text = text.decode(encoding).encode('utf-8')
                    except (UnicodeDecodeError, LookupError), error_message:
                        # Last resort: keep a printable repr of the bytes.
                        text = repr(text)[1:-1]
                else:
                    text = repr(text)[1:-1]
def finishHeader(self):
    """Decode the header currently being accumulated and attach it to the
    part, then clear the accumulator (duplicate of the earlier finishHeader)."""
    if self.prevheader is not None:
        prevheader = self.prevheader.lower()
        decodedValueList = []
        try:
            parts = decode_header(self.prevvalue)
            for maybeUncoded in parts:
                if isinstance(maybeUncoded, unicode):
                    decodedValueList.append(maybeUncoded)
                else:
                    uncoded, encoding = maybeUncoded
                    if encoding is None:
                        encoding = 'ascii'
                    decodedValueList.append(
                        _safelyDecode(uncoded, encoding))
        except ValueError:
            # XXX where is this ValueError coming from?
            # -glyph
            decodedValue = self.prevvalue.decode('ascii', 'replace')
        else:
            decodedValue = u''.join(decodedValueList)
        if prevheader in self._normalizeHeaders:
            # Multi-valued headers are split and added one value at a time.
            values = decodedValue.split(self._normalizeHeaders[prevheader])
            for v in values:
                self.part.addHeader(prevheader, v)
        else:
            self.part.addHeader(self.prevheader, decodedValue)
    # NOTE(review): reset assumed unconditional -- confirm original indent.
    self.prevheader = self.prevvalue = None
def test_japanese_codecs(self):
    """Duplicate of the earlier euc-jp/iso-8859-1 Header round-trip test."""
    eq = self.ndiffAssertEqual
    j = Charset("euc-jp")
    g = Charset("iso-8859-1")
    h = Header("Hello World!")
    # euc-jp bytes for a katakana greeting
    jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
    ghello = 'Gr\xfc\xdf Gott!'
    h.append(jhello, j)
    h.append(ghello, g)
    # BAW: This used to -- and maybe should -- fold the two iso-8859-1
    # chunks into a single encoded word.  However it doesn't violate the
    # standard to have them as two encoded chunks and maybe it's
    # reasonable <wink> for each .append() call to result in a separate
    # encoded word.
    # NOTE(review): line breaks inside these expected strings were
    # reconstructed; confirm against the original test layout.
    eq(h.encode(), """\
Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
 =?iso-8859-1?q?Gr=FC=DF?= =?iso-8859-1?q?_Gott!?=""")
    eq(decode_header(h.encode()),
       [('Hello World!', None),
        ('\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
        ('Gr\xfc\xdf Gott!', 'iso-8859-1')])
    long = 'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5\xa4\xec\xa4\xbf\xa5\xe1\xa1\xbc\xa5\xeb\xa4\xcf\xbb\xca\xb2\xf1\xbc\xd4\xa4\xce\xbe\xb5\xc7\xa7\xa4\xf2\xc2\xd4\xa4\xc3\xa4\xc6\xa4\xa4\xa4\xde\xa4\xb9'
    h = Header(long, j, header_name="Subject")
    # test a very long header
    enc = h.encode()
    # TK: splitting point may differ by codec design and/or Header encoding
    eq(enc, """\
=?iso-2022-jp?b?dGVzdC1qYSAbJEIkWEVqOUYkNSRsJD8lYSE8JWskTztKGyhC?=
 =?iso-2022-jp?b?GyRCMnE8VCROPjVHJyRyQlQkQyRGJCQkXiQ5GyhC?=""")
    # TK: full decode comparison
    eq(h.__unicode__().encode('euc-jp'), long)
def getFrom(self):
    """Return the From header as (entity-quoted display name, address)."""
    realname, address = parseaddr(self._msg.get('from', ''))
    pieces = []
    for text, enc in decode_header(realname):
        if self.codecs_lookup(enc):
            pieces.append(to_unicode(text, enc))
    display = ''.join(pieces)
    return (to_entities_quote(display), address)
def getDecodedHeaders(msg, cset='utf-8'):
    """Returns a string containing all the headers of msg, unfolded and
    RFC 2047 decoded and encoded in cset.
    """
    headers = ''
    for h, v in msg.items():
        uvalue = u''
        try:
            # Fix: the pattern is now a raw string; '\n\s' relied on the
            # invalid escape '\s' silently passing through.
            v = decode_header(re.sub(r'\n\s', ' ', v))
        except HeaderParseError:
            v = [(v, 'us-ascii')]
        for frag, cs in v:
            if not cs:
                cs = 'us-ascii'
            try:
                uvalue += unicode(frag, cs, 'replace')
            except LookupError:
                # The encoding charset is unknown.  At this point, frag
                # has been QP or base64 decoded into a byte string whose
                # charset we don't know how to handle.  We will try to
                # unicode it as iso-8859-1 which may result in a garbled
                # mess, but we have to do something.
                uvalue += unicode(frag, 'iso-8859-1', 'replace')
        headers += '%s: %s\n' % (h, uvalue.encode(cset, 'replace'))
    return headers
def email_parse(self, content):
    """Parse a raw email string into a dict with subject, bodies (as UTF-8
    byte strings), sender/recipient addresses and attachments."""
    p = Parser()
    msgobj = p.parsestr(content)
    if msgobj['Subject'] is not None:
        decodefrag = decode_header(msgobj['Subject'])
        subj_fragments = []
        for s, enc in decodefrag:
            if enc:
                # Re-encode each decoded fragment as UTF-8 bytes.
                s = unicode(s, enc).encode('utf8', 'replace')
            subj_fragments.append(s)
        subject = ''.join(subj_fragments)
    else:
        subject = None
    attachments = []
    body_text = ""
    body_html = ""
    for part in msgobj.walk():
        attachment = self.email_parse_attachment(part)
        if attachment:
            attachments.append(attachment)
        elif part.get_content_type() == "text/plain":
            body_text += unicode(part.get_payload(decode=True),
                                 part.get_content_charset(),
                                 'replace').encode('utf8', 'replace')
        elif part.get_content_type() == "text/html":
            body_html += unicode(part.get_payload(decode=True),
                                 part.get_content_charset(),
                                 'replace').encode('utf8', 'replace')
    return {
        'subject': subject,
        'body_text': body_text,
        'body_html': body_html,
        'from': parseaddr(msgobj.get('From'))[1],
        'to': parseaddr(msgobj.get('To'))[1],
        'attachments': attachments
    }
def initializeObject(context, fields, message, defaultCharset='utf-8'):
    """Demarshal each message header into the matching schema field of
    `context` via IFieldMarshaler adapters; primary fields are collected
    but handled elsewhere."""
    contentType = message.get_content_type()
    # Resolve the message charset: explicit, then parameter, then default.
    charset = message.get_charset()
    if charset is None:
        charset = message.get_param('charset')
    if charset is not None:
        charset = str(charset)
    else:
        charset = defaultCharset
    headerFields = {}
    primary = []
    for name, field in fields:
        if IPrimaryField.providedBy(field):
            primary.append((name, field))
        else:
            headerFields.setdefault(name.lower(), []).append(field)
    # Demarshal each header
    for name, value in message.items():
        name = name.lower()
        fieldset = headerFields.get(name, None)
        if fieldset is None or len(fieldset) == 0:
            LOG.debug("No matching field found for header %s" % name)
            continue
        # Repeated headers consume fields in declaration order.
        field = fieldset.pop(0)
        marshaler = queryMultiAdapter((context, field,), IFieldMarshaler)
        if marshaler is None:
            LOG.debug("No marshaler found for field %s of %s" %
                      (name, repr(context)))
            continue
        # Only the first fragment of the header is used.
        headerValue, headerCharset = decode_header(value)[0]
        if headerCharset is None:
            headerCharset = charset
        # MIME messages always use CRLF. For headers, we're probably safer
        # with \n
        headerValue = headerValue.replace('\r\n', '\n')
        try:
            marshaler.demarshal(
                headerValue,
                message=message,
                charset=headerCharset,
                contentType=contentType,
                primary=False
            )
        except ValueError, e:
            # interface allows demarshal() to raise ValueError to indicate
            # marshalling failed
            LOG.debug("Demarshalling of %s for %s failed: %s" %
                      (name, repr(context), str(e)))
            continue
def getSubject(self):
    """Return the Subject header decoded and entity-quoted."""
    raw = self._msg.get('subject', '')
    chunks = []
    for text, enc in decode_header(raw):
        if self.codecs_lookup(enc):
            chunks.append(to_unicode(text, enc))
    return to_entities_quote(''.join(chunks))
def __call__(self, request):
    """Process an incoming (headers, message) pair: store the mail when a
    partner matches, otherwise bounce a warning to the sender; recurse
    into embedded message/rfc822 parts."""
    headers, msg = request
    partners = self.get_partners(headers, msg)
    # RFC 2047-decode the subject fragment by fragment.
    subject = u''
    for string, charset in decode_header(msg['Subject']):
        if charset:
            subject += string.decode(charset)
        else:
            subject += unicode(string)
    if partners:
        self.save_mail(msg, subject, partners)
    else:
        # No matching partner: notify the sender by mail.
        warning = MIMEText((warn_msg % (subject, )).encode('utf-8'),
                           'plain', 'utf-8')
        warning['Subject'] = 'Message de PengERP'
        warning['From'] = '*****@*****.**'
        warning['To'] = msg['From']
        s = smtplib.SMTP()
        s.connect()
        s.sendmail('*****@*****.**',
                   self.email_re.findall(msg['From']),
                   warning.as_string())
        s.close()
    if msg.is_multipart():
        # Recurse into forwarded/attached messages.
        for message in [m for m in msg.get_payload()
                        if m.get_content_type() == 'message/rfc822']:
            self((headers, message.get_payload()[0]))
def _decodeHeaders(msg, defaultCharacterSet='ascii'): """Decode message into (header, value) pairs.""" # Get all mail headers. headers = msg.keys() # List of {header: value}. Sample: # [ # {'From': '*****@*****.**', 'To': '*****@*****.**',}, # ] headers_values = [] for h in headers: # Skip non-exist headers. if not h in msg.keys(): continue try: # Decode header value to list of (decoded_string, charset) pairs. # Convert into unicode. header_value = u' '.join([ unicode(text, charset or defaultCharacterSet) for text, charset in decode_header(msg[h]) ]) headers_values += [{h: header_value}] except Exception, e: pass
def ClearSubjectHeader(strSubject):
    """Returns mailSubject without list name"""
    result = strSubject
    # Strip every known mailing-list tag before decoding.
    for prefix in ("UKCoach", "ec-l", "eurocoach-list"):
        result = result.replace("[" + prefix + "]", "")
    # Keep only the first decoded fragment; its charset is discarded.
    result, _encoding = decode_header(result)[0]
    return result
def _get_header(str): '''Get the full text of a header and remove newlines.''' list = decode_header(str) retString = '' for string, charset in list: retString += string.replace("\n", '') return retString
def process_message(self, remoteHosts, mailfrom, rcpttos, data):
    """Relay the message to any known real addresses, then log headers,
    bodies and attachments to the database.

    Fix: `file_count` was used to name anonymous attachments but was never
    defined (NameError on the first unnamed part); it is now initialised
    and incremented.  The local `file` (shadowing the builtin) is renamed.
    """
    # Lazily cache the set of active real addresses.
    if not BlackHoleSmtp.real_address_list:
        BlackHoleSmtp.real_address_list = set([
            a.email for a in RealAddress.objects.filter(suspend__exact=False)
        ])
    recipients = ','.join(rcpttos)
    real_addresses = BlackHoleSmtp.real_address_list & set(
        rcpttos)  #TODO check rcpttos data
    for real_address in real_addresses:
        smtpd.PureProxy.process_message(self, remoteHosts, mailfrom,
                                        [real_address], data)
        #check real_address data
        #smtpd.PureProxy.process_message(self, remoteHosts, mailfrom, rcpttos, data)
        debug('send email to: %s', (real_address, ))
    msg = message_from_string(data)
    log = LoggedMail(from_address=mailfrom, to_address=recipients)
    charset = 'latin1'
    log.raw_header = '\n'.join(
        ['%s:%s' % (key, msg.get(key)) for key in msg.keys()])
    header = ''
    for key in msg.keys():
        # Only the first fragment of each header is considered.
        value, enc = decode_header(msg.get(key))[0]
        if not enc:
            value = unicode(value)
        else:
            value = unicode(value, enc)
            # Remember the last declared charset for body decoding.
            # NOTE(review): original indentation was ambiguous here;
            # assigning only on the encoded branch avoids charset=None.
            charset = enc
        header += '%s:%s(mime:%s)\n' % (key, value, enc)
        if key == 'Subject':
            log.subject = value
    log.header = header
    log.charset = charset
    log.save()
    file_name_base = 'msg_%07d_' % log.id
    file_count = 0  # fix: previously undefined
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get_content_maintype() == 'text':
            log.raw_body += part.get_payload(decode=0)
            log.body += unicode(part.get_payload(decode=1), charset)
            log.save()
            continue
        file_name = part.get_filename()
        if not file_name:
            # Unnamed attachment: synthesise part-NNN.<ext>.
            ext = ".bin"
            if hasattr(part, 'get_type'):
                ext = mimetypes.guess_extension(part.get_type())
            file_count += 1
            file_name = 'part-%03d%s' % (file_count, ext)
        at = log.attatchment_set.create(origin_name=file_name)
        file_name = file_name_base + file_name
        payload = part.get_payload(decode=1)
        f = open(os.path.join(ATTACH_DIR, file_name), 'w')
        f.write(payload)
        f.close()
        at.file = u'attach/%s' % file_name
        at.save()
def initializeObject(context, fields, message, defaultCharset='utf-8'):
    """Demarshal message headers into schema fields of `context` through
    IFieldMarshaler adapters (duplicate of the earlier initializeObject)."""
    contentType = message.get_content_type()
    # Charset resolution: message charset, then parameter, then default.
    charset = message.get_charset()
    if charset is None:
        charset = message.get_param('charset')
    if charset is not None:
        charset = str(charset)
    else:
        charset = defaultCharset
    headerFields = {}
    primary = []
    for name, field in fields:
        if IPrimaryField.providedBy(field):
            primary.append((name, field))
        else:
            headerFields.setdefault(name.lower(), []).append(field)
    # Demarshal each header
    for name, value in message.items():
        name = name.lower()
        fieldset = headerFields.get(name, None)
        if fieldset is None or len(fieldset) == 0:
            LOG.debug("No matching field found for header %s" % name)
            continue
        # Repeated headers consume fields in declaration order.
        field = fieldset.pop(0)
        marshaler = queryMultiAdapter((
            context,
            field,
        ), IFieldMarshaler)
        if marshaler is None:
            LOG.debug("No marshaler found for field %s of %s" %
                      (name, repr(context)))
            continue
        # Only the first decoded fragment of the header is used.
        headerValue, headerCharset = decode_header(value)[0]
        if headerCharset is None:
            headerCharset = charset
        # MIME messages always use CRLF. For headers, we're probably safer with \n
        headerValue = headerValue.replace('\r\n', '\n')
        try:
            marshaler.demarshal(headerValue,
                                message=message,
                                charset=headerCharset,
                                contentType=contentType,
                                primary=False)
        except ValueError, e:
            # interface allows demarshal() to raise ValueError to indicate marshalling failed
            LOG.debug("Demarshalling of %s for %s failed: %s" %
                      (name, repr(context), str(e)))
            continue
def parse(content): p = EmailParser() msgobj = p.parsestr(content) if msgobj['Subject'] is not None: decodefrag = decode_header(msgobj['Subject']) subj_fragments = [] for s, enc in decodefrag: if enc: s = unicode(s , enc).encode('utf8', 'replace') subj_fragments.append(s) subject = ''.join(subj_fragments) else: subject = None attachments = [] body = None html = None images = [] images_content_type = [ "image/jpg", "image/jpeg", "image/png", "image/tiff" "application/pdf", ] for part in msgobj.walk(): print part.get_content_type() attachment = parse_attachment(part) if attachment: attachments.append(attachment) elif part.get_content_type() == "text/plain": if body is None: body = "" body += unicode( part.get_payload(decode=True), part.get_content_charset(), 'replace' ).encode('utf8', 'replace') elif part.get_content_type() == "text/html": if html is None: html = "" html += unicode( part.get_payload(decode=True), part.get_content_charset(), 'replace' ).encode('utf8', 'replace') elif part.get_content_type() in images_content_type: images.append(StringIO(part.get_payload(decode=True))) return { 'subject': subject, 'body': body, 'html': html, 'from': parseaddr(msgobj.get('From'))[1], 'to': parseaddr(msgobj.get('To'))[1], 'attachments': attachments, 'images': images, }
def getCC(self):
    """Return the decoded Cc entries as (entity-quoted name, address) pairs."""
    results = []
    for realname, address in getaddresses(self._msg.get_all('cc', '')):
        pieces = [to_unicode(text, enc)
                  for text, enc in decode_header(realname)
                  if self.codecs_lookup(enc)]
        results.append((to_entities_quote(''.join(pieces)), address))
    return results
def decode_QP(string):
    """Decode an RFC 2047 header into unicode, joining fragments with spaces.

    Fix: decode_header returns already-decoded ``str`` for unencoded
    fragments; passing those to ``str(decoded, charset, 'replace')``
    raised TypeError.  Only byte fragments are decoded now; the
    iso-8859-15 fallback for a missing charset is kept.
    """
    parts = []
    for decoded, charset in decode_header(string):
        if isinstance(decoded, bytes):
            parts.append(str(decoded, charset or 'iso-8859-15', 'replace'))
        else:
            parts.append(decoded)
    return u' '.join(parts)
def send(self, from_, to, message):
    """Testing stub: print the message instead of sending it and record the
    decoded text in self.sent."""
    print '*TestingMailDelivery sending*:'
    # Show only the first decoded fragment of the From header.
    print 'From:', decode_header(from_)[0][0]
    print 'To:', ', '.join(to)
    print 'Message follows:'
    decoded = decodeMessageAsString(message)
    print decoded
    self.sent.append(decoded)
def get_header(self, header_text, default="ascii"):
    "Decode and return the header"
    # Empty or None passes straight through unchanged.
    if not header_text:
        return header_text
    pieces = []
    for section, enc in decode_header(header_text):
        pieces.append(section.decode(enc or default, 'replace'))
    return ' '.join(pieces)
def parse(content):
    """parse email

    Fix: ``if part.get_content_charset:`` referenced the bound method
    without calling it, so the test was always true and the text/plain
    decode branch was unreachable; the charset is now actually queried
    and the payload decoded when one is declared.
    """
    p = EmailParser()
    #check content is a file or text
    #if content is path...
    #msgobj = p.parse(content)
    msgobj = p.parsestr(content)
    if msgobj['Subject'] is not None:
        decodefrag = decode_header(msgobj['Subject'])
        subj_fragments = []
        for s, enc in decodefrag:
            if enc:
                s = unicode(s, enc).encode('utf8', 'replace')
            subj_fragments.append(s)
        subject = ''.join(subj_fragments)
    else:
        subject = None
    attachments = []
    body = None
    html = None
    for part in msgobj.walk():
        attachment = parse_attachment(part)
        if attachment:
            attachments.append(attachment)
        elif part.get_content_type() == "text/plain":
            if body is None:
                body = ""
            charset = part.get_content_charset()
            if charset is None:
                # No declared charset: keep the raw payload.
                body += part.get_payload(decode=True)
            else:
                body += unicode(
                    part.get_payload(decode=True),
                    charset,
                    'replace'
                ).encode('utf8', 'replace')
        elif part.get_content_type() == "text/html":
            if html is None:
                html = ""
            html += unicode(
                part.get_payload(decode=True),
                part.get_content_charset(),
                'replace'
            ).encode('utf8', 'replace')
    return {
        'subject': subject,
        'body': body,
        'html': html,
        'from': parseaddr(msgobj.get('From'))[1],
        'to': parseaddr(msgobj.get('To'))[1],
        'date': parse_date(msgobj.get('Date')),
        'attachments': attachments,
    }
def decode_helper(self, headerpart):
    """Decode every fragment of an encoded header; return the stripped
    result encoded as UTF-8."""
    pieces = []
    for fragment, charset in decode_header(headerpart):
        if charset:
            fragment = fragment.decode(charset)
        pieces.append(fragment)
    combined = ''.join(pieces)
    return combined.strip().encode('UTF-8', 'replace')
def decode_TEXT(value):
    r"""Decode RFC-2047 TEXT (e.g. "=?utf-8?q?f=C3=BCr?=" -> u"f\xfcr")."""
    pieces = []
    for atom, charset in decode_header(value):
        if charset is not None:
            atom = atom.decode(charset)
        pieces.append(atom)
    return "".join(pieces)
def decode_email(self, file):
    """Parse a raw email string into a dict with UTF-8 headers, a body and
    an attachment list; the original source is kept under '__original__'.

    NOTE(review): the parameter `file` shadows the Python 2 builtin.
    """
    # Prepare result
    theMail = {
        'attachment_list': [],
        'body': '',
        # Place all the email header in the headers dictionary in theMail
        'headers': {}
    }
    # Get Message
    msg = email.message_from_string(file)
    # Back up original file
    theMail['__original__'] = file
    # Recode headers to UTF-8 if needed
    for key, value in msg.items():
        decoded_value_list = decode_header(value)
        unicode_value = make_header(decoded_value_list)
        new_value = unicode_value.__unicode__().encode('utf-8')
        theMail['headers'][key.lower()] = new_value
    # Filter mail addresses
    for header in ('resent-to', 'resent-from', 'resent-cc', 'resent-sender',
                   'to', 'from', 'cc', 'sender', 'reply-to'):
        header_field = theMail['headers'].get(header)
        if header_field:
            # Keep the bare address only, dropping the display name.
            theMail['headers'][header] = parseaddr(header_field)[1]
    # Get attachments
    body_found = 0
    for part in msg.walk():
        content_type = part.get_content_type()
        file_name = part.get_filename()
        # multipart/* are just containers
        # XXX Check if data is None ?
        if content_type.startswith('multipart'):
            continue
        # message/rfc822 contains attached email message
        # next 'part' will be the message itself
        # so we ignore this one to avoid doubling
        elif content_type == 'message/rfc822':
            continue
        elif content_type in ("text/plain", "text/html"):
            charset = part.get_content_charset()
            payload = part.get_payload(decode=True)
            #LOG('CMFMailIn -> ',0,'charset: %s, payload: %s' % (charset,payload))
            if charset:
                payload = unicode(payload, charset).encode('utf-8')
            if body_found:
                # Keep the content type
                theMail['attachment_list'].append(
                    (file_name, content_type, payload))
            else:
                # First text part becomes the body; later ones are attachments.
                theMail['body'] = payload
                body_found = 1
        else:
            payload = part.get_payload(decode=True)
            # Keep the content type
            theMail['attachment_list'].append(
                (file_name, content_type, payload))
    return theMail
def decode_TEXT(value):
    r"""Decode :rfc:`2047` TEXT (e.g. "=?utf-8?q?f=C3=BCr?=" -> u"f\xfcr")."""
    # Deferred, Python 2-style import kept as in the original.
    from email.Header import decode_header
    pieces = []
    for atom, charset in decode_header(value):
        if charset is not None:
            atom = atom.decode(charset)
        pieces.append(atom)
    return "".join(pieces)
def decoded_header(self, msgrep, header):
    """Return the named header RFC 2047-decoded (encoded fragments are
    re-encoded as UTF-8 bytes), or None when the header is absent."""
    raw = msgrep[header]
    if raw is None:
        return None
    chunks = []
    for text, enc in decode_header(raw):
        if enc:
            text = unicode(text, enc).encode('utf8', 'replace')
        chunks.append(text)
    return ''.join(chunks)
def decoded_header(self, msgrep, header):
    """Return the RFC 2047-decoded value of 'header' as a utf-8 str.

    Returns None when msgrep has no such header.
    """
    value = msgrep[header]
    if value is None:
        return None
    out = []
    for frag, charset in decode_header(value):
        if charset:
            # decode with replacement, then re-encode as utf-8
            frag = str(frag).decode(charset, 'replace').encode('utf8', 'replace')
        out.append(frag)
    return ''.join(out)
def decode_TEXT(value):
    """Decode an RFC 2047-encoded header value into a unicode string."""
    from email.Header import decode_header
    chunks = []
    for chunk, enc in decode_header(value):
        # only encoded-word fragments come back with a charset
        if enc is not None:
            chunk = chunk.decode(enc)
        chunks.append(chunk)
    return ''.join(chunks)
def __init__(self, context, message): # -> none
    """Extract the bits of interest from an RFC2822 message string.

    context should be a wiki page.

    This perhaps should do the isJunk test up front to avoid
    unnecessary resource usage.
    """
    DEBUG('mailin.py processing incoming message:\n%s' % message)
    self.context = context
    self.original = message
    self.msg = email.message_from_string(self.original)
    self.date = self.msg['Date']
    # flatten a multi-line subject into one line
    s = re.sub('\n', '', self.msg.get('Subject', ''))
    # convert the possibly RFC2047-encoded subject to unicode.
    # Only the first encoded part is used if there is more than one.
    # misencoded subjects are ignored.
    (s, enc) = decode_header(s)[0]
    try:
        self.subject = tounicode(s, enc or 'ascii')
    except UnicodeDecodeError:
        self.subject = ''
    # subject with any leading "[listname] "-style prefix stripped
    self.realSubject = re.sub(r'.*?\[.*?\] ?(.*)', r'\1', self.subject)
    self.messageid = self.msg.get('Message-id', '')
    self.inreplyto = self.msg.get('In-reply-to', '')
    self.From = self.msg.get('From')
    self.FromRealName = parseaddr(self.From)[0]
    self.FromEmail = parseaddr(self.From)[1]
    # fall back to the local part of the address when no real name is given
    self.FromUserName = (self.FromRealName or
                         re.sub(r'@.*$', r'', self.FromEmail))
    self.sender = self.msg.get('Sender')
    self.senderEmail = (self.sender and parseaddr(self.sender)[1]) or None
    # all direct and resent recipients, as (realname, address) pairs
    tos = self.msg.get_all('to', [])
    ccs = self.msg.get_all('cc', [])
    resent_tos = self.msg.get_all('resent-to', [])
    resent_ccs = self.msg.get_all('resent-cc', [])
    self.recipients = getaddresses(tos + ccs + resent_tos + resent_ccs)
    # mailing list support
    # XXX x-beenthere is mailman-specific - need to support ezmlm & others here
    #self.xbeenthere = (self.msg.get('X-BeenThere') or
    #                   re.search(r'[^\s<]+@[^\s>]+',self.msg.get('Delivered-To')).group())
    # ..Type Error - configured ezmlm to provide beenthere instead (?)
    self.xbeenthere = self.msg.get('X-BeenThere')
    # the mailin body will be the message's first text/plain part
    # (or a null string if there is none or it's misencoded)
    try:
        firstplaintextpart = typed_subpart_iterator(self.msg,
                                                    'text',
                                                    'plain').next()
        # as I understand it:
        # first decoding, from the content-transfer-encoding, eg quoted-printable
        payload = firstplaintextpart.get_payload(decode=1)
        # second decoding, from utf8 or whatever to unicode
        charset = self.msg.get_content_charset('ascii')
        payloadutf8 = payload.decode(charset).encode('utf-8')
    except (StopIteration, UnicodeDecodeError):
        payloadutf8 = ''
    self.body = cleanupBody(payloadutf8)
def getCC(self):
    """Return decoded Cc recipients as (entity-quoted name, address) pairs."""
    recipients = []
    for name, addr in getaddresses(self._msg.get_all('cc', '')):
        fragments = decode_header(name)
        # drop fragments whose charset we cannot look up
        text = ''.join(to_unicode(frag, enc)
                       for frag, enc in fragments
                       if self.codecs_lookup(enc))
        recipients.append((to_entities_quote(text), addr))
    return recipients
def decode(x):
    """Collapse an RFC 2047-encoded header value into one unicode string."""
    pieces = []
    for text, charset in decode_header(x):
        if charset:
            # encoded-word fragment: decode with its declared charset
            pieces.append(text.decode(charset))
        else:
            pieces.append(text)
    return u''.join(pieces)
def get_header(header_text, default="ascii"): "Decode and return the header" if not header_text: return header_text sections = decode_header(header_text) parts = [] for section, encoding in sections: try: parts.append(section.decode(encoding or default, 'replace')) except LookupError: parts.append(section.decode(default, 'replace')) return u' '.join(parts)
def decode_charset(self, field):
    """Convert the header 'field' into a single-line Unicode string."""
    # TK: This function was rewritten for unifying to Unicode.
    try:
        decoded_pairs = decode_header(field)
        result = make_header(decoded_pairs).__unicode__()
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # undecodable header: assume list's language
        charset = Utils.GetCharSet(self._mlist.preferred_language)
        if charset == 'us-ascii':
            charset = 'iso-8859-1'  # assume this for English list
        result = unicode(field, charset, 'replace')
    return u''.join(result.splitlines())