def parse(self, bytesfile):
    """Parse an email read from *bytesfile* into a dict of its main fields.

    The returned dict carries the parsed message object (``msgobj``), the
    parsed ``date`` and ``subject``, the newline-joined ``body`` and
    ``html`` text, the sender as a ``(name, lowercased address)`` tuple
    under ``from``, the recipient lists (``to`` also includes any
    ``X-Original-To`` addresses) and the collected ``attachments``.
    """
    msgobj = Parser().parse(bytesfile)
    subject = self.parse_header_field(msgobj["Subject"])

    # These lists are handed to parse_body, which is expected to append to
    # them while it goes through the message parts.
    attachments, text_parts, html_parts = [], [], []
    self.parse_body(msgobj.walk(), attachments, text_parts, html_parts)
    body = u"\n".join(text_parts)
    html = u"\n".join(html_parts)

    def addresses(header_name):
        return self.get_address_list(msgobj.get_all(header_name, []))

    recipients = addresses("To")
    recipients.extend(addresses("X-Original-To"))
    copies = addresses("Cc")
    resent_recipients = addresses("resent-to")
    resent_copies = addresses("resent-cc")

    display_name, address = parseaddr(self.get(msgobj.get("From")))
    sender = (
        self.parse_header_field(display_name),
        address.lower() if address else address,
    )

    date = self.parse_date(self.get(msgobj.get("Date")))

    result = {}
    result["msgobj"] = msgobj
    result["date"] = date
    result["subject"] = subject
    result["body"] = body
    result["html"] = html
    result["from"] = sender
    result["to"] = recipients
    result["cc"] = copies
    result["resent_to"] = resent_recipients
    result["resent_cc"] = resent_copies
    result["attachments"] = attachments
    return result
def parse(self, bytesfile):
    """Parse an email read from *bytesfile* into a dict of its main fields.

    Besides the parsed message object (``msgobj``) the dict holds the raw
    ``Message-Id`` header (``message_id``), the parsed ``date`` and
    ``subject``, newline-joined ``body`` and ``html`` text, the sender as a
    ``(name, lowercased address)`` tuple under ``from``, the recipient
    lists (``to`` also includes ``X-Original-To``) and ``attachments``.
    """
    msgobj = Parser().parse(bytesfile)
    subject = self.parse_header_field(msgobj['Subject'])

    text_parts, html_parts, attachments = self.parse_body(msgobj.walk())
    body = '\n'.join(text_parts)
    html = '\n'.join(html_parts)

    def addresses(header_name):
        return self.get_address_list(msgobj.get_all(header_name, []))

    recipients = addresses('To')
    recipients.extend(addresses('X-Original-To'))
    copies = addresses('Cc')
    resent_recipients = addresses('resent-to')
    resent_copies = addresses('resent-cc')

    display_name, address = parseaddr(self.get(msgobj.get('From')))
    sender = (
        self.parse_header_field(display_name),
        address.lower() if address else address,
    )

    date = self.parse_date(self.get(msgobj.get("Date")))

    result = {'msgobj': msgobj}
    result['message_id'] = msgobj.get('Message-Id')
    result['date'] = date
    result['subject'] = subject
    result['body'] = body
    result['html'] = html
    result['from'] = sender
    result['to'] = recipients
    result['cc'] = copies
    result['resent_to'] = resent_recipients
    result['resent_cc'] = resent_copies
    result['attachments'] = attachments
    return result
def _get_content(self):
    """Return ``self.content`` parsed into an email message object."""
    # self.content is provided by __getattr__ through the cache var self._content
    raw_stream = BytesIO(self.content)
    return BytesParser().parse(raw_stream)
def get_metadata_for_wheel(url):
    """Download the wheel at *url* and return its parsed METADATA headers.

    The response body is opened as a zip archive in memory and the first
    member whose name ends in ``.dist-info/METADATA`` is parsed
    (headers only).  When no such member exists an empty ``EmailMessage``
    is returned, so callers can always use header-style lookups on the
    result.
    """
    data = requests.get(url).content
    with ZipFile(BytesIO(data)) as archive:
        for member in archive.namelist():
            if member.endswith(".dist-info/METADATA"):
                # The parser reads the whole stream before returning, so
                # the member can be closed as soon as parse() is done.
                with archive.open(member) as stream:
                    return BytesParser().parse(stream, headersonly=True)

    # If we didn't find the metadata, return an empty message
    return EmailMessage()
def _get_metadata(self):
    """Load the wheel's METADATA headers into ``self._metadata`` once.

    Does nothing when ``self._metadata`` is already set.  Otherwise the
    archive returned by ``self.get_zip()`` is searched for the first member
    whose name ends in ``.dist-info/METADATA`` and its headers are parsed.

    Raises:
        RuntimeError: if the archive contains no such member.
    """
    if self._metadata:
        return

    with self.get_zip() as archive:
        for member in archive.namelist():
            if member.endswith(".dist-info/METADATA"):
                # Close the member stream once parsed instead of leaving it
                # open until garbage collection.
                with archive.open(member) as stream:
                    self._metadata = BytesParser().parse(stream, headersonly=True)
                return

    raise RuntimeError("Wheel has no metadata")
def get_email_headers(message_bytes, headers=None):
    """Return the decoded values of selected headers of a raw email.

    *message_bytes* is parsed as an email message.  For every name in
    *headers* (or, when *headers* is None, every header name present in the
    message) the result maps the name to a list with each occurrence of
    that header passed through ``parse_header_field``.  Names that do not
    occur in the message map to an empty list.
    """
    with closing(BytesIO(message_bytes)) as stream:
        msgobj = Parser().parse(stream)

    wanted = dict(msgobj) if headers is None else headers

    decoded = {}
    for name in wanted:
        occurrences = msgobj.get_all(name, [])
        decoded[name] = [parse_header_field(value) for value in occurrences]
    return decoded
def list_mail(self, dir):
    """Return the Subject and From headers of every mail file under *dir*.

    The directory tree is walked recursively and each file is parsed
    headers-only.  The result is a list of ``{'subject': ..., 'from': ...}``
    dicts, one per file; a header that is absent from a file is None.
    An empty or missing directory yields an empty list.
    """
    parser = BytesParser()
    messages = []
    for root, _dirs, filenames in os.walk(dir):
        for filename in filenames:
            # Join against the directory being walked (not the top-level
            # one) so files in sub-directories are found, and open in
            # binary mode because BytesParser needs a bytes stream.
            with open(os.path.join(root, filename), 'rb') as handle:
                message = parser.parse(handle, True)
            messages.append({'subject': message['Subject'], 'from': message['From']})
    return messages
def get_metadata(self):
    """Download this file (if it is a wheel) and cache its METADATA headers.

    Non-wheel files are reported on stdout and skipped.  When
    ``self.metadata`` is already set nothing is downloaded.  Otherwise the
    content at ``self.url`` is opened as a zip archive and the first member
    whose name ends in ``.dist-info/METADATA`` is parsed (headers only)
    into ``self.metadata``.  If no such member exists ``self.metadata`` is
    left untouched.
    """
    if self.filetype != "wheel":
        print(f"{self.name}: No wheel fo type {self.filetype}")
        return
    if self.metadata:
        return

    data = requests.get(self.url).content
    with ZipFile(BytesIO(data)) as archive:
        for member in archive.namelist():
            if member.endswith('.dist-info/METADATA'):
                # Close the member stream as soon as it has been parsed.
                with archive.open(member) as stream:
                    self.metadata = BytesParser().parse(stream, headersonly=True)
                break
def parse_attachment(self, message_part): content_disposition = message_part.get("Content-Disposition", None) if content_disposition: dispo_type, dispo_dict = self.parse_dispositions( content_disposition) if dispo_type == "attachment" or (dispo_type == 'inline' and 'filename' in dispo_dict): content_type = message_part.get("Content-Type", None) file_data = message_part.get_payload(decode=True) if file_data is None: payloads = message_part.get_payload() file_data = '\n\n'.join([p.as_string() for p in payloads]) try: file_data = file_data.encode('utf-8') except: pass attachment = BytesIO(file_data) attachment.content_type = message_part.get_content_type() attachment.size = len(file_data) attachment.name = None attachment.create_date = None attachment.mod_date = None attachment.read_date = None if "filename" in dispo_dict: attachment.name = dispo_dict['filename'] if content_type: _, content_dict = self.parse_dispositions(content_type) if 'name' in content_dict: attachment.name = content_dict['name'] if attachment.name is None and content_type == 'message/rfc822': p = Parser() msgobj = p.parse(BytesIO(attachment.getvalue())) subject = self.parse_header_field(msgobj['Subject']) if subject: attachment.name = '%s.eml' % subject[:45] if "create-date" in dispo_dict: attachment.create_date = dispo_dict[ 'create-date'] # TODO: datetime if "modification-date" in dispo_dict: attachment.mod_date = dispo_dict[ 'modification-date'] # TODO: datetime if "read-date" in dispo_dict: attachment.read_date = dispo_dict[ 'read-date'] # TODO: datetime return attachment return None
def parse(path: str) -> dict:
    """Parse the ``.eml`` file at *path* into a plain dict.

    Returns a dict with:

    * ``subject``, ``from``, ``to`` - the decoded header text, or ``""``
      when the header is missing;
    * ``date`` - the raw Date header, or ``""`` when missing;
    * ``content`` - the text of the first non-attachment ``text/plain``
      part, always as ``str`` (``""`` when there is none);
    * ``attachments`` - a list of ``{"name", "content"}`` dicts, one per
      non-multipart part that has a filename, where ``content`` is whatever
      ``process_attachment`` returns for the decoded payload;
    * ``path`` - the *path* argument.
    """
    with open(path, 'rb') as eml_file:
        message = BytesParser().parse(eml_file)

    def header_text(name: str) -> str:
        # A missing header comes back as None, which decode_header cannot
        # handle, so map it to "" before decoding.
        raw = message.get(name)
        if raw is None:
            return ""
        return str(make_header(decode_header(raw)))

    def decode_text(payload: bytes, charset) -> str:
        # Always hand back str.  Parts without a declared charset, or with
        # one Python does not know, are decoded as UTF-8; undecodable bytes
        # are replaced rather than aborting the whole parse.
        try:
            return payload.decode(charset or "utf-8", errors="replace")
        except LookupError:
            return payload.decode("utf-8", errors="replace")

    date = message.get("date")
    mail = {
        "subject": header_text("subject"),
        "from": header_text("from"),
        "to": header_text("to"),
        "date": "" if date is None else date,
        "content": "",
        "attachments": [],
        "path": path
    }

    content = ""
    attachments = []
    found_plain_text = False
    for part in message.walk():
        if part.is_multipart():
            continue

        file_name = part.get_filename()
        if file_name:
            file_name = str(make_header(decode_header(file_name)))
            file_data = process_attachment(file_name, part.get_payload(decode=True))
            attachments.append({"name": file_name, "content": file_data})
        elif not found_plain_text and part.get_content_type() == 'text/plain':
            # Only the first plain-text part becomes the mail content.
            found_plain_text = True
            payload = part.get_payload(decode=True)
            if payload is not None:
                content = decode_text(payload, part.get_content_charset())

    mail["content"] = content
    mail["attachments"] = attachments
    return mail
def parse_attachment(self, message_part): content_disposition = message_part.get("Content-Disposition", None) if content_disposition: dispo_type, dispo_dict = self.parse_dispositions(content_disposition) if dispo_type == "attachment" or (dispo_type == 'inline' and 'filename' in dispo_dict): content_type = message_part.get("Content-Type", None) file_data = message_part.get_payload(decode=True) if file_data is None: payloads = message_part.get_payload() file_data = '\n\n'.join([p.as_string() for p in payloads]) try: file_data = file_data.encode('utf-8') except: pass attachment = BytesIO(file_data) attachment.content_type = message_part.get_content_type() attachment.size = len(file_data) attachment.name = None attachment.create_date = None attachment.mod_date = None attachment.read_date = None if "filename" in dispo_dict: attachment.name = dispo_dict['filename'] if content_type: _, content_dict = self.parse_dispositions(content_type) if 'name' in content_dict: attachment.name = content_dict['name'] if attachment.name is None and content_type == 'message/rfc822': p = Parser() msgobj = p.parse(BytesIO(attachment.getvalue())) subject = self.parse_header_field(msgobj['Subject']) if subject: attachment.name = '%s.eml' % subject[:45] if "create-date" in dispo_dict: attachment.create_date = dispo_dict['create-date'] # TODO: datetime if "modification-date" in dispo_dict: attachment.mod_date = dispo_dict['modification-date'] # TODO: datetime if "read-date" in dispo_dict: attachment.read_date = dispo_dict['read-date'] # TODO: datetime return attachment return None
def parse(self, bytesfile):
    """Parse an email read from *bytesfile* and return a ``ParsedEmail``.

    The plain-text and HTML parts reported by ``parse_email_body`` are each
    joined with newlines and stripped.  When there is HTML but no plain
    text, the body is derived from the HTML via ``convert_html_to_text``.
    The main headers from ``parse_main_headers`` plus ``body``, ``html``
    and ``attachments`` are passed to ``ParsedEmail`` as keyword arguments.
    """
    msgobj = Parser().parse(bytesfile)

    text_parts, html_parts, attachments = parse_email_body(msgobj)
    body = '\n'.join(text_parts).strip()
    html = '\n'.join(html_parts).strip()
    if html and not body:
        body = convert_html_to_text(html)

    email_info = parse_main_headers(msgobj)
    extracted = {}
    extracted['body'] = body
    extracted['html'] = html
    extracted['attachments'] = attachments
    email_info.update(extracted)

    return ParsedEmail(msgobj, **email_info)
def parse_attachment(self, message_part):
    """Build a file-like attachment from *message_part*, or return None.

    A part qualifies when its Content-Disposition is ``attachment``, or
    ``inline`` with a ``filename`` parameter.  The result is a ``BytesIO``
    over the decoded payload carrying ``content_type``, ``size``, ``name``,
    ``create_date``, ``mod_date`` and ``read_date`` attributes (None when
    not available).
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if not content_disposition:
        return None

    dispo_type, dispo_dict = self.parse_dispositions(content_disposition)
    named_inline = dispo_type == "inline" and "filename" in dispo_dict
    if not (dispo_type == "attachment" or named_inline):
        return None

    content_type = message_part.get("Content-Type", None)

    file_data = message_part.get_payload(decode=True)
    if file_data is None:
        # Nothing to decode directly: serialise the sub-parts instead.
        serialised_parts = [sub.as_string() for sub in message_part.get_payload()]
        file_data = "\n\n".join(serialised_parts).encode("utf-8")

    attachment = BytesIO(file_data)
    attachment.content_type = message_part.get_content_type()
    attachment.size = len(file_data)

    # Name resolution: disposition filename first, then the Content-Type
    # "name" parameter (which wins), then the subject of an embedded message.
    name = dispo_dict["filename"] if "filename" in dispo_dict else None
    if content_type:
        _, content_dict = self.parse_dispositions(content_type)
        if "name" in content_dict:
            name = content_dict["name"]
    if name is None and content_type == "message/rfc822":
        embedded = Parser().parse(BytesIO(attachment.getvalue()))
        subject = self.parse_header_field(embedded["Subject"])
        if subject:
            name = "%s.eml" % subject[:45]
    attachment.name = name

    # TODO: datetime
    date_fields = (
        ("create_date", "create-date"),
        ("mod_date", "modification-date"),
        ("read_date", "read-date"),
    )
    for attribute, key in date_fields:
        setattr(attachment, attribute, dispo_dict[key] if key in dispo_dict else None)

    return attachment
class ArchivesParser(object):
    """Parse one raw email and extract the pieces the archive works with.

    Call ``parse()`` with a binary stream, then ``analyze()`` to populate
    ``msgid``, ``_from``, ``to``, ``cc``, ``subject``, ``date``,
    ``bodytxt``, ``attachments`` and ``parents``.  Problems with an
    individual message are reported by raising ``IgnorableException``.
    """

    def __init__(self):
        self.parser = BytesParser(policy=compat32)

    def parse(self, stream):
        """Read all of *stream*, keeping the raw bytes and the parsed message."""
        self.rawtxt = stream.read()
        self.msg = self.parser.parse(io.BytesIO(self.rawtxt))

    def is_msgid(self, msgid):
        """Return True if the parsed message has Message-ID *msgid*."""
        # Look for a specific messageid. This means we might parse it twice,
        # but so be it. Any exception means we know it's not this one...
        try:
            found = self.clean_messageid(
                self.decode_mime_header(self.get_mandatory('Message-ID')))
            return found == msgid
        except Exception:
            return False

    def analyze(self, date_override=None):
        """Extract headers, date, body, attachments and thread parents.

        *date_override*, when given, is parsed and used instead of the
        Date header.  Raises ``IgnorableException`` when a mandatory header
        is missing or cannot be decoded.
        """
        self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'), True)
        self.to = self.decode_mime_header(self.get_optional('To'), True)
        self.cc = self.decode_mime_header(self.get_optional('CC'), True)
        self.subject = self.decode_mime_header(self.get_optional('Subject'))
        if date_override:
            self.date = self.forgiving_date_decode(date_override)
        else:
            self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))

            # Accept times up to 4 hours in the future, for badly synced clocks
            maxdate = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=4)
            if self.date > maxdate:
                # Date is in the future, we don't trust that. Instead, let's see if we can find
                # it in the raw text of the message.
                def _extract_date(d):
                    m = _re_received.match(d)
                    if m:
                        try:
                            return self.forgiving_date_decode(m.group(1).strip())
                        except IgnorableException:
                            pass

                # A message may have no Received headers at all, or none
                # with a usable date, so both the header list and min()
                # need a fallback.
                received = self.msg.get_all('Received', [])
                lowdate = min(
                    (x for x in map(_extract_date, received) if x and x < maxdate),
                    default=None,
                )
                if lowdate:
                    self.date = lowdate
                # Else we're going to go with what we found
        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if len(self.attachments) > 0:
            log.status("Found %s attachments" % len(self.attachments))

        # Build a list of the message id's we are interested in
        self.parents = []
        # The first one is in-reply-to, if it exists
        if self.get_optional('in-reply-to'):
            m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
            if m:
                self.parents.append(m)

        # Then we add all References values, in backwards order
        if self.get_optional('references'):
            cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we can't
            # use a set() to make it unique, because order is very important
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    # Broken or non-standard charset names seen in mail, keyed by their
    # lowercased spelling, mapped to a codec name Python understands.
    _charset_aliases = {
        # Special case where we don't know... We'll assume
        # us-ascii and use replacements
        'unknown-8bit': 'us-ascii',
        'x-unknown': 'us-ascii',
        'unknown': 'us-ascii',
        # Seriously broken charset definitions, map to us-ascii
        # and throw away the rest with replacements
        '0': 'us-ascii',
        'x-user-defined': 'us-ascii',
        '_autodetect_all': 'us-ascii',
        'default_charset': 'us-ascii',
        # Some MUAs set it to x-gbk, but there is a valid
        # declaration as gbk...
        'x-gbk': 'gbk',
        # -I is a special logical version, but should be the
        # same charset
        'iso-8859-8-i': 'iso-8859-8',
        # This is an alias for iso-8859-11
        'windows-874': 'iso-8859-11',
        # Strange way of saying 8859....
        'iso-88-59-1': 'iso-8859-1',
        'iso-8858-1': 'iso-8859-1',
        'iso885915': 'iso-8859-15',
        'iso-latin-2': 'iso-8859-2',
        # Strange spelling of cp850 (windows charset)
        'iso-850': 'cp850',
        'koi8r': 'koi8-r',
        'cp 1252': 'cp1252',
        # Why did this show up more than once?!
        'iso-8859-1,iso-8859-2': 'iso-8859-1',
        'iso-8859-1:utf8:us-ascii': 'iso-8859-1',
        'x-windows-949': 'ms949',
        # This is a locale, and not a charset, but most likely it's this one
        'pt_pt': 'iso-8859-1',
        'de_latin': 'iso-8859-1',
        'de': 'iso-8859-1',
        # How is this a *common* mistake?
        'iso-8858-15': 'iso-8859-15',
        'macintosh': 'mac_roman',
        'cn-big5': 'big5',
        'x-unicode-2-0-utf-7': 'utf-7',
        # No support for this charset :S Map it down to ascii
        # and throw away all the rest. sucks, but we have to
        'tscii': 'us-ascii',
    }

    def clean_charset(self, charset):
        """Map a known-broken charset name to a usable one.

        Names that are not in the alias table are returned unchanged.
        """
        return self._charset_aliases.get(charset.lower(), charset)

    def get_payload_as_unicode(self, msg):
        """Return the decoded payload of *msg* as text.

        Returns None when there is no payload and ``''`` when the payload
        is present but empty.  Raises ``IgnorableException`` when the
        declared charset is not known to Python.
        """
        try:
            b = msg.get_payload(decode=True)
        except AssertionError:
            # Badly encoded data can throw an exception here, where the python
            # libraries fail to handle it and enters a cannot-happen path.
            # In which case we just ignore it and hope for a better MIME part later.
            b = None

        if b:
            # Find out if there is a charset
            charset = None
            params = msg.get_params()
            if not params:
                # No content-type, so we assume us-ascii
                return str(b, 'us-ascii', errors='ignore')
            for k, v in params:
                if k.lower() == 'charset':
                    charset = v
                    break
            if charset:
                try:
                    return str(b, self.clean_charset(charset), errors='ignore')
                except LookupError as e:
                    raise IgnorableException("Failed to get unicode payload: %s" % e)
            else:
                # XXX: reasonable default?
                return str(b, errors='ignore')

        # Return None or empty string, depending on what we got back.
        # The decoded payload is bytes, so an empty one has to be turned
        # into '' explicitly: callers detect "empty body" with `== ''`,
        # which b'' never satisfies.
        if b is None:
            return None
        return ''

    # Regular expression matching the PostgreSQL custom mail footer that
    # is appended to all emails.
    _re_footer = re.compile(r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)

    def get_body(self):
        """Return the message body text with known artifacts removed."""
        b = self._get_body()
        if b:
            # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
            # later reject..
            b = b.replace('\udbff\n\udef8', '')

        # Remove postgres specific mail footer - if it's there
        m = self._re_footer.match(b)
        if m:
            b = m.group(1)

        # Sometimes we end up with a trailing \0 when decoding long strings, so
        # replace it if it's there.
        # In fact, replace it everywhere, since it can also turn up in the middle
        # of a text when it's a really broken decoding.
        b = b.replace('\0', '')

        return b

    def _get_body(self):
        # This is where the magic happens - try to figure out what the body
        # of this message should render as.
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #      text/plain
        #      text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '' or b is None:
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" % self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        """Return the text of the first text/plain (or text/html) part.

        With *html_instead* set, text/html parts are accepted as well.
        Parts marked as attachments are skipped.  Returns None when nothing
        suitable is found in *container*.
        """
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None
        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
                return self.get_payload_as_unicode(p)
            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b

        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None

    def get_attachments(self):
        """Collect attachments of the parsed message into ``self.attachments``."""
        self.attachments_found_first_plaintext = False
        self.recursive_get_attachments(self.msg)

    # Clean a filenames encoding and return it as a unicode string
    def _clean_filename_encoding(self, filename):
        # If this is a header-encoded filename, start by decoding that
        if filename.startswith('=?'):
            decoded, encoding = decode_header(filename)[0]
            if isinstance(decoded, str):
                # Looked like an encoded word but was not one, so
                # decode_header handed the text back untouched.
                return decoded
            return str(decoded, encoding or 'us-ascii', errors='ignore')

        # If it's already unicode, just return it
        if isinstance(filename, str):
            return filename

        # Anything that's not UTF8, we just get rid of. We can live with
        # filenames slightly mangled in this case.
        return str(filename, 'utf-8', errors='ignore')

    def _extract_filename(self, container):
        # Try to get the filename for an attachment in the container.
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)

        # Failing that, some mailers set Content-Description to the
        # filename
        if 'Content-Description' in container:
            return self._clean_filename_encoding(container['Content-Description'])
        return None

    def recursive_get_attachments(self, container):
        """Append ``(filename, content_type, payload)`` tuples for *container*.

        Recurses into multipart/mixed and multipart/signed containers.
        """
        # We start recursion in the "multipart" container if any
        if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
        elif container.get_content_type() == 'multipart/alternative':
            # Alternative is not an attachment (we decide)
            # It's typically plaintext + html
            self.attachments_found_first_plaintext = True
            return
        elif container.is_multipart():
            # Other kinds of multipart, such as multipart/signed...
            return
        else:
            # Not a multipart.
            # Exclude specific contenttypes
            if container.get_content_type() == 'application/pgp-signature':
                return
            if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
                return
            # For now, accept anything not text/plain
            if container.get_content_type() != 'text/plain':
                try:
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return
                return

            # It's a text/plain, it might be worthwhile.
            # If it has a name, we consider it an attachments
            if not container.get_params():
                return
            for k, v in container.get_params():
                if k == 'name' and v != '':
                    # Yes, it has a name
                    try:
                        self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                    except AssertionError:
                        # Badly encoded data can throw an exception here, where the python
                        # libraries fail to handle it and enters a cannot-happen path.
                        # In which case we just ignore this attachment.
                        return

                    return

            # If it's content-disposition=attachment, we also want to save it
            if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
                try:
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                return

            # If we have already found one text/plain part, make all
            # further text/plain parts attachments
            if self.attachments_found_first_plaintext:
                # However, this will also *always* catch the MIME part added
                # by majordomo with the footer. So if that one is present,
                # we need to explicitly exclude it again.
                try:
                    b = container.get_payload(decode=True)
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                # NOTE(review): get_payload(decode=True) normally returns
                # bytes, so this str check looks like it can never pass -
                # confirm whether extra text/plain parts are meant to be
                # stored before changing it.
                if isinstance(b, str) and not self._re_footer.match(b):
                    # We know there is no name for this one
                    self.attachments.append((None, container.get_content_type(), b))
                return

            # Ok, so this was a plaintext that we ignored. Set the flag
            # that we have now ignored one, so we'll make the next one
            # an attachment.
            self.attachments_found_first_plaintext = True
            # No name, and text/plain, so ignore it

    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        """Return the id between the angle brackets, with spaces removed.

        When *messageid* has no ``<...>`` part, returns None if
        *ignorebroken* is set and raises ``IgnorableException`` otherwise.
        """
        m = self.re_msgid.match(messageid)
        if not m:
            if ignorebroken:
                log.status("Could not parse messageid '%s', ignoring it" % messageid)
                return None
            raise IgnorableException("Could not parse message id '%s'" % messageid)
        return m.group(1).replace(' ', '')

    # _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
    # Now using [^\s] instead of \w, to work with japanese chars
    _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
    _date_multiminus_re = re.compile(r' -(-\d+)$')
    _date_offsetnoplus_re = re.compile(r' (\d{4})$')

    # Strange timezones requiring manual adjustments: (suffix, replacement),
    # applied in this order.
    _date_suffix_fixes = (
        ('-7700 (EST)', 'EST'),
        ('+6700 (EST)', 'EST'),
        ('+-4-30', '+0430'),
        ('+1.00', '+0100'),
        ('+-100', '+0100'),
        ('+500', '+0500'),
        ('-500', '-0500'),
        ('-700', '-0700'),
        ('-800', '-0800'),
        ('+05-30', '+0530'),
        ('+0-900', '-0900'),
        ('Mexico/General', 'CDT'),
        ('Pacific Daylight Time', 'PDT'),
        (' ZE2', ' +0200'),
    )

    def forgiving_date_decode(self, d):
        """Parse the date string *d* into a timezone-aware datetime.

        Applies a series of fixups for malformed dates before parsing.
        Raises ``IgnorableException`` when *d* is empty or cannot be parsed.
        """
        if d.strip() == '':
            raise IgnorableException("Failed to parse empty date")
        # Strange timezones requiring manual adjustments
        for suffix, replacement in self._date_suffix_fixes:
            if d.endswith(suffix):
                d = d.replace(suffix, replacement)
        if d.find('-Juin-') > 0:
            d = d.replace('-Juin-', '-Jun-')
        if d.find('-Juil-') > 0:
            d = d.replace('-Juil-', '-Jul-')
        d = d.replace(' 0 (GMT)', ' +0000')

        if self._date_multiminus_re.search(d):
            d = self._date_multiminus_re.sub(' \\1', d)

        if self._date_offsetnoplus_re.search(d):
            d = self._date_offsetnoplus_re.sub('+\\1', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        if self._date_multi_re.search(d):
            d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # "<full datespace> +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        if self._date_multi_re2.search(d):
            d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)

            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            if dp.utcoffset() and abs(dp.utcoffset().days * (24 * 60 * 60) + dp.utcoffset().seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            if not dp.tzinfo:
                dp = dp.replace(tzinfo=datetime.timezone.utc)
            return dp
        except Exception as e:
            raise IgnorableException("Failed to parse date '%s': %s" % (d, e))

    def _maybe_decode(self, s, charset):
        if isinstance(s, str):
            return s.strip(' ')
        codec = self.clean_charset(charset) if charset else 'us-ascii'
        return str(s, codec or 'us-ascii', errors='ignore').strip(' ')

    # Workaround for broken quoting in some MUAs (see below)
    _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)

    def _decode_mime_header(self, hdr, email_workaround):
        # NOTE: email_workaround is accepted but not consulted here; the
        # quoting workaround below is applied to every header.
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
        # we must get rid of the sequence \n\t at least in the header. If we
        # do this *before* doing any MIME decoding, we should be safe against
        # anybody *actually* putting that sequence in the header (since we
        # won't match the encoded contents)
        hdr = hdr.replace("\n\t", " ")

        # In at least some cases, at least gmail (and possibly other MUAs)
        # incorrectly put double quotes in the name/email field even when
        # it's encoded. That's not allowed - they have to be escaped - but
        # since there's a fair amount of those, we apply a regex to get
        # rid of them.
        m = self._re_mailworkaround.search(hdr)
        if m:
            hdr = self._re_mailworkaround.sub(r'\1', hdr)

        try:
            return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
        except HeaderParseError:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors.
            # hdr is already a str here, so drop the non-ascii characters
            # with an encode/decode round trip.
            return hdr.encode('us-ascii', errors='ignore').decode('us-ascii').strip(' ')

    def decode_mime_header(self, hdr, email_workaround=False):
        """Return header *hdr* decoded to text, with NUL characters removed.

        Returns ``''`` for a missing or empty header.  Raises
        ``IgnorableException`` when decoding fails with a LookupError or
        ValueError.
        """
        try:
            if isinstance(hdr, Header):
                hdr = hdr.encode()

            h = self._decode_mime_header(hdr, email_workaround)
            if h:
                return h.replace("\0", "")
            return ''
        except LookupError as e:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
        except ValueError as ve:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))

    def get_mandatory(self, fieldname):
        """Return header *fieldname*, raising IgnorableException if absent."""
        try:
            x = self.msg[fieldname]
        except Exception:
            x = None
        if x is None:
            raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
        return x

    def get_optional(self, fieldname):
        """Return header *fieldname* (None if absent), or '' on lookup errors."""
        try:
            return self.msg[fieldname]
        except Exception:
            return ''

    def html_clean(self, html):
        """Return the plain text of *html*, or None if it cannot be cleaned."""
        # First we pass it through tidy
        (html, errors) = tidylib.tidy_document(html, options={
            'drop-proprietary-attributes': 1,
            'alt-text': '',
            'hide-comments': 1,
            'output-xhtml': 1,
            'show-body-only': 1,
            'clean': 1,
            'char-encoding': 'utf8',
            'show-warnings': 0,
            'show-info': 0,
        })
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception:
            # Failed to parse the html, thus failed to clean it. so we must
            # give up...
            return None
class ArchivesParser(object):
    """Parse a raw email message and extract the fields used for archiving.

    Typical use is ``parse(stream)`` followed by ``analyze()``, after which
    the instance carries ``msgid``, ``_from``, ``to``, ``cc``, ``subject``,
    ``date``, ``bodytxt``, ``attachments`` and ``parents``.

    Problems that should cause a single message to be skipped are reported
    by raising IgnorableException.
    """

    def __init__(self):
        self.parser = BytesParser(policy=compat32)

    def parse(self, stream):
        """Read all of *stream* (a binary file object) and parse it."""
        self.rawtxt = stream.read()
        self.msg = self.parser.parse(io.BytesIO(self.rawtxt))

    def is_msgid(self, msgid):
        """Return True if the parsed message has Message-ID *msgid*.

        This means we might parse the header twice, but so be it. Any
        exception while extracting the id means it's not this one.
        """
        try:
            found = self.clean_messageid(
                self.decode_mime_header(self.get_mandatory('Message-ID')))
        except Exception:
            return False
        return found == msgid

    def analyze(self, date_override=None):
        """Extract headers, date, body, attachments and parent message ids.

        date_override, when given, is parsed and used instead of the Date
        header (and is not subjected to the future-date check).

        Raises IgnorableException when a mandatory header is missing or a
        value cannot be decoded.
        """
        self.msgid = self.clean_messageid(
            self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'), True)
        self.to = self.decode_mime_header(self.get_optional('To'), True)
        self.cc = self.decode_mime_header(self.get_optional('CC'), True)
        self.subject = self.decode_mime_header(self.get_optional('Subject'))

        if date_override:
            self.date = self.forgiving_date_decode(date_override)
        else:
            self.date = self.forgiving_date_decode(
                self.decode_mime_header(self.get_mandatory('Date')))

            # Accept times up to 4 hours in the future, for badly synced
            # clocks
            maxdate = datetime.datetime.now(
                datetime.timezone.utc) + datetime.timedelta(hours=4)
            if self.date > maxdate:
                # Date is in the future, we don't trust that. Instead, let's
                # see if we can find it in the Received headers.
                def _extract_date(d):
                    m = _re_received.match(d)
                    if m:
                        try:
                            return self.forgiving_date_decode(
                                m.group(1).strip())
                        except IgnorableException:
                            pass
                    return None

                # A message may have no Received headers at all, or none
                # with a usable date; neither case may crash the analysis.
                candidates = (
                    x for x in map(_extract_date,
                                   self.msg.get_all('Received', []))
                    if x and x < maxdate)
                lowdate = min(candidates, default=None)
                if lowdate:
                    self.date = lowdate
                # Else we're going to go with what we found

        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if len(self.attachments) > 0:
            log.status("Found %s attachments" % len(self.attachments))

        # Build a list of the message ids we are interested in
        self.parents = []
        # The first one is in-reply-to, if it exists
        if self.get_optional('in-reply-to'):
            m = self.clean_messageid(
                self.decode_mime_header(self.get_optional('in-reply-to')),
                True)
            if m:
                self.parents.append(m)

        # Then we add all References values, in backwards order
        if self.get_optional('references'):
            cleaned_msgids = [
                self.clean_messageid(x, True)
                for x in reversed(
                    self.decode_mime_header(
                        self.get_optional('references')).split())
            ]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we
            # can't use a set() to make it unique, because order is very
            # important
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    # Broken or non-standard charset declarations (lowercased) mapped to a
    # codec name that Python knows about.
    _CHARSET_ALIASES = {
        # Special case where we don't know... We'll assume us-ascii and
        # use replacements
        'unknown-8bit': 'us-ascii',
        'x-unknown': 'us-ascii',
        'unknown': 'us-ascii',
        # Seriously broken charset definitions, map to us-ascii and throw
        # away the rest with replacements
        '0': 'us-ascii',
        'x-user-defined': 'us-ascii',
        '_autodetect_all': 'us-ascii',
        'default_charset': 'us-ascii',
        # Some MUAs set it to x-gbk, but there is a valid declaration as gbk
        'x-gbk': 'gbk',
        # -I is a special logical version, but should be the same charset
        'iso-8859-8-i': 'iso-8859-8',
        # This is an alias for iso-8859-11
        'windows-874': 'iso-8859-11',
        # Strange ways of saying 8859....
        'iso-88-59-1': 'iso-8859-1',
        'iso-8858-1': 'iso-8859-1',
        'iso885915': 'iso-8859-15',
        'iso-latin-2': 'iso-8859-2',
        # Strange spelling of cp850 (windows charset)
        'iso-850': 'cp850',
        'koi8r': 'koi8-r',
        'cp 1252': 'cp1252',
        # Why did this show up more than once?!
        'iso-8859-1,iso-8859-2': 'iso-8859-1',
        'iso-8859-1:utf8:us-ascii': 'iso-8859-1',
        'x-windows-949': 'ms949',
        # These are locales, not charsets, but most likely it's this one
        'pt_pt': 'iso-8859-1',
        'de_latin': 'iso-8859-1',
        'de': 'iso-8859-1',
        # How is this a *common* mistake?
        'iso-8858-15': 'iso-8859-15',
        'macintosh': 'mac_roman',
        'cn-big5': 'big5',
        'x-unicode-2-0-utf-7': 'utf-7',
        # No support for this charset :S Map it down to ascii and throw
        # away all the rest. sucks, but we have to
        'tscii': 'us-ascii',
    }

    def clean_charset(self, charset):
        """Map a known-broken charset name to a usable codec name.

        The lookup is case-insensitive. Names that are not in the table
        are returned unchanged (in their original case).
        """
        return self._CHARSET_ALIASES.get(charset.lower(), charset)

    def get_payload_as_unicode(self, msg):
        """Return the decoded payload of *msg* as a str.

        Returns None when there is no payload to decode and '' when the
        payload is empty. Raises IgnorableException when the declared
        charset is unknown to Python.
        """
        try:
            b = msg.get_payload(decode=True)
        except AssertionError:
            # Badly encoded data can throw an exception here, where the
            # python libraries fail to handle it and enters a cannot-happen
            # path. In which case we just ignore it and hope for a better
            # MIME part later.
            b = None

        if b is None:
            return None
        if not b:
            # The payload is empty bytes; hand back an empty *string* so
            # that callers comparing against '' see it as "empty".
            return ''

        params = msg.get_params()
        if not params:
            # No content-type, so we assume us-ascii
            return str(b, 'us-ascii', errors='ignore')

        # Find out if there is a charset
        charset = None
        for k, v in params:
            if k.lower() == 'charset':
                charset = v
                break

        if charset:
            try:
                return str(b, self.clean_charset(charset), errors='ignore')
            except LookupError as e:
                raise IgnorableException(
                    "Failed to get unicode payload: %s" % e)
        # XXX: reasonable default?
        return str(b, errors='ignore')

    # Regular expression matching the PostgreSQL custom mail footer that
    # is appended to all emails.
    _re_footer = re.compile(
        r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$',
        re.DOTALL)

    def get_body(self):
        """Return the message body as text with known garbage removed."""
        b = self._get_body()
        if b:
            # Python bug 9133, allows unicode surrogate pairs - which
            # PostgreSQL will later reject..
            b = b.replace('\udbff\n\udef8', '')

        # Remove postgres specific mail footer - if it's there
        m = self._re_footer.match(b)
        if m:
            b = m.group(1)

        # Sometimes we end up with a trailing \0 when decoding long strings,
        # so replace it if it's there.
        # In fact, replace it everywhere, since it can also turn up in the
        # middle of a text when it's a really broken decoding.
        b = b.replace('\0', '')

        return b

    def _get_body(self):
        """Figure out what the body of this message should render as.

        Returns the text, or '' when only an empty body could be found.
        Raises IgnorableException when nothing usable is present.
        """
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #      text/plain
        #      text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '' or b is None:
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" %
                                 self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        """Return the text of the first text/plain part below *container*.

        With html_instead set, text/html parts are accepted as well.
        Parts marked as attachments are skipped. Returns None when nothing
        was found at this level.
        """
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None

        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status(
                    "Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain."
                    % self.msgid)
                return self.get_payload_as_unicode(p)

            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p[
                        'Content-Disposition'].startswith('attachment'):
                    continue
                t = self.get_payload_as_unicode(p)
                if t:
                    return t

            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p[
                        'Content-Disposition'].startswith('attachment'):
                    continue
                t = self.get_payload_as_unicode(p)
                if t:
                    return t

            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b

        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None

    def get_attachments(self):
        """Collect attachments of the parsed message into self.attachments.

        self.attachments must already be a list; entries are appended as
        (filename, content type, payload) tuples.
        """
        self.attachments_found_first_plaintext = False
        self.recursive_get_attachments(self.msg)

    def _clean_filename_encoding(self, filename):
        """Clean a filename's encoding and return it as a unicode string."""
        if isinstance(filename, bytes):
            # Anything that's not UTF8, we just get rid of. We can live with
            # filenames slightly mangled in this case.
            filename = str(filename, 'utf-8', errors='ignore')

        # If this is a header-encoded filename, start by decoding that
        if filename.startswith('=?'):
            decoded, encoding = decode_header(filename)[0]
            if isinstance(decoded, str):
                # Looked like an encoded word but wasn't one, so
                # decode_header() handed the text back untouched.
                return decoded
            return str(decoded, encoding or 'us-ascii', errors='ignore')

        # It's already unicode, just return it
        return filename

    def _extract_filename(self, container):
        """Return the attachment filename for *container*, or None."""
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)

        # Failing that, some mailers set Content-Description to the
        # filename
        if 'Content-Description' in container:
            return self._clean_filename_encoding(
                container['Content-Description'])
        return None

    def _save_attachment(self, container):
        """Append *container* to self.attachments, ignoring broken data."""
        try:
            self.attachments.append(
                (self._extract_filename(container),
                 container.get_content_type(),
                 container.get_payload(decode=True)))
        except AssertionError:
            # Badly encoded data can throw an exception here, where the
            # python libraries fail to handle it and enters a cannot-happen
            # path. In which case we just ignore this attachment.
            pass

    def recursive_get_attachments(self, container):
        """Walk *container* and record every part considered an attachment."""
        ctype = container.get_content_type()

        # We start recursion in the "multipart" container if any
        if ctype in ('multipart/mixed', 'multipart/signed'):
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
            return

        if ctype == 'multipart/alternative':
            # Alternative is not an attachment (we decide)
            # It's typically plaintext + html
            self.attachments_found_first_plaintext = True
            return

        if container.is_multipart():
            # Other kinds of multipart are not scanned
            return

        # Not a multipart.
        # Exclude specific contenttypes
        if ctype == 'application/pgp-signature':
            return
        if ctype in ('application/pkcs7-signature',
                     'application/x-pkcs7-signature'):
            return

        # For now, accept anything not text/plain
        if ctype != 'text/plain':
            self._save_attachment(container)
            return

        # It's a text/plain, it might be worthwhile.
        # If it has a name, we consider it an attachment
        if not container.get_params():
            return
        for k, v in container.get_params():
            if k == 'name' and v != '':
                # Yes, it has a name
                self._save_attachment(container)
                return

        # If it's content-disposition=attachment, we also want to save it
        if 'Content-Disposition' in container and container[
                'Content-Disposition'].startswith('attachment'):
            self._save_attachment(container)
            return

        # If we have already found one text/plain part, make all
        # further text/plain parts attachments
        if self.attachments_found_first_plaintext:
            # However, this will also *always* catch the MIME part added
            # by majordomo with the footer. So if that one is present,
            # we need to explicitly exclude it again.
            try:
                b = container.get_payload(decode=True)
            except AssertionError:
                # Badly encoded data can throw an exception here, where the
                # python libraries fail to handle it and enters a
                # cannot-happen path. In which case we just ignore this
                # attachment.
                return

            # NOTE(review): get_payload(decode=True) normally yields bytes,
            # so this isinstance(str) test presumably never passes and such
            # parts are not stored - confirm before changing, since fixing
            # it alters which attachments get recorded.
            if isinstance(b, str) and not self._re_footer.match(b):
                # We know there is no name for this one
                self.attachments.append(
                    (None, container.get_content_type(), b))
            return

        # Ok, so this was a plaintext that we ignored. Set the flag
        # that we have now ignored one, so we'll make the next one
        # an attachment.
        self.attachments_found_first_plaintext = True
        # No name, and text/plain, so ignore it

    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        """Return the id between <> in *messageid* with spaces removed.

        If it cannot be parsed, returns None when ignorebroken is set and
        raises IgnorableException otherwise.
        """
        m = self.re_msgid.match(messageid)
        if not m:
            if ignorebroken:
                log.status("Could not parse messageid '%s', ignoring it" %
                           messageid)
                return None
            raise IgnorableException("Could not parse message id '%s'" %
                                     messageid)
        return m.group(1).replace(' ', '')

    # Uses [^\s] instead of \w, to work with japanese chars
    _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
    _date_multiminus_re = re.compile(r' -(-\d+)$')
    _date_offsetnoplus_re = re.compile(r' (\d{4})$')

    # Strange timezones requiring manual adjustments, as
    # (trailing text, replacement) applied in this order.
    _date_suffix_fixes = (
        ('-7700 (EST)', 'EST'),
        ('+6700 (EST)', 'EST'),
        ('+-4-30', '+0430'),
        ('+1.00', '+0100'),
        ('+-100', '+0100'),
        ('+500', '+0500'),
        ('-500', '-0500'),
        ('-700', '-0700'),
        ('-800', '-0800'),
        ('+05-30', '+0530'),
        ('+0-900', '-0900'),
        ('Mexico/General', 'CDT'),
        ('Pacific Daylight Time', 'PDT'),
        (' ZE2', ' +0200'),
    )

    def forgiving_date_decode(self, d):
        """Parse date string *d* into a timezone-aware datetime.

        A number of known-broken formats are patched up before parsing.
        Raises IgnorableException for an empty or unparsable date.
        """
        if d.strip() == '':
            raise IgnorableException("Failed to parse empty date")

        for suffix, replacement in self._date_suffix_fixes:
            if d.endswith(suffix):
                d = d.replace(suffix, replacement)

        if d.find('-Juin-') > 0:
            d = d.replace('-Juin-', '-Jun-')
        if d.find('-Juil-') > 0:
            d = d.replace('-Juil-', '-Jul-')
        if ' 0 (GMT)' in d:
            d = d.replace(' 0 (GMT)', ' +0000')

        d = self._date_multiminus_re.sub(' \\1', d)
        d = self._date_offsetnoplus_re.sub('+\\1', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # "<full datespace> +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)

            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            offset = dp.utcoffset()
            if offset and abs(offset.days * (24 * 60 * 60) +
                              offset.seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            if not dp.tzinfo:
                dp = dp.replace(tzinfo=datetime.timezone.utc)
            return dp
        except Exception as e:
            raise IgnorableException("Failed to parse date '%s': %s" % (d, e))

    def _maybe_decode(self, s, charset):
        """Return *s* as a str stripped of surrounding spaces.

        bytes are decoded with *charset* (us-ascii when none is given),
        ignoring undecodable data.
        """
        if isinstance(s, str):
            return s.strip(' ')
        codec = self.clean_charset(charset) if charset else 'us-ascii'
        return str(s, codec, errors='ignore').strip(' ')

    # Workaround for broken quoting in some MUAs (see below)
    _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"',
                                    re.IGNORECASE)

    def _decode_mime_header(self, hdr, email_workaround):
        """Decode a MIME-encoded header string; returns None for None.

        NOTE(review): email_workaround is accepted but not consulted here;
        the quote-stripping workaround below is always applied.
        """
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it
        # seems we must get rid of the sequence \n\t at least in the header.
        # If we do this *before* doing any MIME decoding, we should be safe
        # against anybody *actually* putting that sequence in the header
        # (since we won't match the encoded contents)
        hdr = hdr.replace("\n\t", " ")

        # In at least some cases, at least gmail (and possibly other MUAs)
        # incorrectly put double quotes in the name/email field even when
        # it's encoded. That's not allowed - they have to be escaped - but
        # since there's a fair amount of those, we apply a regex to get
        # rid of them.
        hdr = self._re_mailworkaround.sub(r'\1', hdr)

        try:
            return " ".join([
                self._maybe_decode(s, charset)
                for s, charset in decode_header(hdr)
            ])
        except HeaderParseError:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors.
            # hdr is a str at this point, so drop whatever isn't ascii by
            # round-tripping through the codec.
            return hdr.encode('us-ascii',
                              errors='ignore').decode('us-ascii').strip(' ')

    def decode_mime_header(self, hdr, email_workaround=False):
        """Decode header value *hdr* to a str with NUL characters removed.

        Returns '' for a missing or empty header. Raises IgnorableException
        when the value cannot be decoded.
        """
        try:
            if isinstance(hdr, Header):
                hdr = hdr.encode()

            h = self._decode_mime_header(hdr, email_workaround)
            if h:
                return h.replace("\0", "")
            return ''
        except (LookupError, ValueError) as e:
            raise IgnorableException(
                "Failed to decode header value '%s': %s" % (hdr, e))

    def get_mandatory(self, fieldname):
        """Return header *fieldname*; raise IgnorableException if missing."""
        try:
            x = self.msg[fieldname]
        except Exception:
            x = None
        if x is None:
            raise IgnorableException("Mandatory field '%s' is missing" %
                                     fieldname)
        return x

    def get_optional(self, fieldname):
        """Return header *fieldname* as stored in the message.

        Returns '' if looking the header up raises.
        """
        try:
            return self.msg[fieldname]
        except Exception:
            return ''

    def html_clean(self, html):
        """Reduce *html* to plain text; returns None if that fails."""
        # First we pass it through tidy
        (html, errors) = tidylib.tidy_document(
            html,
            options={
                'drop-proprietary-attributes': 1,
                'alt-text': '',
                'hide-comments': 1,
                'output-xhtml': 1,
                'show-body-only': 1,
                'clean': 1,
                'char-encoding': 'utf8',
                'show-warnings': 0,
                'show-info': 0,
            })
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception:
            # Failed to parse the html, thus failed to clean it. so we must
            # give up...
            return None
class EmailMessage(object): ''' Email Message. Messages should be converted to EmailMessage as soon as possible, to check whether the message is parsable as part of validating input. If a MIME message is not parsable, a new Message will be created that does conform and contains the original unparsable message in the body. ''' DEBUGGING = False def __init__(self, message_or_file=None): ''' Creates an EmailMessage from a Message or a file. Non-mime messages are converted to MIME "text/plain". >>> email_message = EmailMessage() >>> type(email_message) <class 'goodcrypto.mail.message.email_message.EmailMessage'> ''' self.bad_header_lines = [] self.parser = Parser() self._last_charset = constants.DEFAULT_CHAR_SET self._log = self._message = None if message_or_file is None: self._message = Message() elif isinstance(message_or_file, Message): self._message = message_or_file elif isinstance(message_or_file, EmailMessage): self._message = message_or_file.get_message() else: try: if isinstance(message_or_file, IOBase) or isinstance(message_or_file, StringIO): self.log_message('about to parse a message from a file') try: self._message = self.parser.parse(message_or_file) self.log_message('parsed message from file') except TypeError: message_or_file.seek(0, os.SEEK_SET) self.parser = BytesParser() self._message = self.parser.parse(message_or_file) self.log_message('parsed message from file as bytes') else: try: self.log_message('about to parse a message from a string') self._message = self.parser.parsestr(message_or_file) self.log_message('parsed message from string') except TypeError: self.parser = BytesParser() self._message = self.parser.parsebytes(message_or_file) self.log_message('parsed message from bytes') if not self.validate_message(): self._create_good_message_from_bad(message_or_file) except Exception: try: self.log_message('EXCEPTION - see syr.exception.log for details') record_exception() self._create_good_message_from_bad(message_or_file) # if we still don't have 
a good message, then blow up if not self.validate_message(): self.log_message('unable to create a valid message') raise MessageException() except Exception: record_exception() if self.DEBUGGING: try: self.log_message(self.to_string()) except: pass def get_header(self, key): ''' Get a header from an existing message. >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name >>> with open(get_encrypted_message_name('basic.txt')) as input_file: ... email_message = EmailMessage(input_file) ... crypto_software = email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER) >>> crypto_software == 'GPG' True ''' try: value = self.get_message().__getitem__(key) except Exception: value = None return value def add_header(self, key, value): ''' Add a header to an existing message. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('basic.txt')) as input_file: ... email_message = EmailMessage(input_file) ... email_message.add_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER, 'GPG') ... crypto_software = email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER) >>> crypto_software == 'GPG' True ''' self._message.__setitem__(key, value) def change_header(self, key, value): ''' Change a header to an existing message. >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name >>> with open(get_encrypted_message_name('bouncy-castle.txt')) as input_file: ... email_message = EmailMessage(input_file) ... email_message.change_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER, 'TestGPG') ... crypto_software = email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER) >>> crypto_software == 'TestGPG' True ''' if key in self._message: self._message.replace_header(key, value) else: self.add_header(key, value) def delete_header(self, key): ''' Delete a header to an existing message. 
>>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name >>> with open(get_encrypted_message_name('bouncy-castle.txt')) as input_file: ... email_message = EmailMessage(input_file) ... email_message.delete_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER) ... email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER) is None True ''' self._message.__delitem__(key) def get_message(self): ''' Get the message. >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> from goodcrypto.oce.test_constants import EDWARD_LOCAL_USER >>> email_message = get_basic_email_message() >>> email_message.get_message() is not None True >>> email_message.get_message().get(mime_constants.FROM_KEYWORD) == EDWARD_LOCAL_USER True ''' return self._message def set_message(self, new_message): ''' Set the new message. # Get a basic message first so we can avoid recursion >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> from goodcrypto.oce.test_constants import EDWARD_LOCAL_USER >>> basic_email_message = get_basic_email_message().get_message() >>> email_message = EmailMessage() >>> email_message.get_message().get(mime_constants.FROM_KEYWORD) is None True >>> email_message.set_message(basic_email_message) >>> email_message.get_message().get(mime_constants.FROM_KEYWORD) == EDWARD_LOCAL_USER True ''' old_message = self._message if is_string(new_message): try: if isinstance(self.parser, Parser): self._message = self.parser.parsestr(new_message) else: self._message = self.parser.parsebytes(new_message.encode()) except: self._message = old_message record_exception() else: self._message = new_message # restore the old message if the new one isn't valid. if not self.validate_message(): self._message = old_message self.log_message('restored previous message') def validate_message(self): ''' Validate a message. 
Python's parser frequently accepts a message that has garbage in the header by simply adding all header items after the bad header line(s) to the body text; this can leave a pretty unmanageable message so we apply our own validation. >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> from goodcrypto.oce.test_constants import EDWARD_LOCAL_USER >>> email_message = get_basic_email_message() >>> email_message.validate_message() True ''' try: validator = Validator(self) if validator.is_message_valid(): valid = True self.log_message('message is valid') else: valid = False self.log_message('message is invalid') self.log_message(validator.get_why()) except Exception as AttributeError: valid = False record_exception() return valid def get_text(self): ''' Gets text from the current Message. This method works with both plain and MIME messages, except open pgp mime. If the message is MIMEMultipart, the text is from the first text/plain part. >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> email_message = get_basic_email_message() >>> text = email_message.get_text() >>> text == 'Test message text' True ''' text = None message = self.get_message() if is_open_pgp_mime(message): self.log_message("unable to get text from openpgp mime message") else: if message.is_multipart(): self.log_message("message is a MIMEMultipart") # get the first text/plain part result_ok = False part_index = 0 parts = message.get_payload() while part_index < len(parts) and not result_ok: part = message.get_payload(part_index) content_type = part.get_content_type() if content_type == mime_constants.TEXT_PLAIN_TYPE: text = self._get_decoded_payload(part) result_ok = True else: self.log_message("body part type is " + content_type) part_index += 1 else: text = self._get_decoded_payload(message) self.log_message("payload is a: {}".format(type(text))) return text def set_text(self, text, charset=None): ''' Sets text in the current Message. 
This method works with both plain and MIME messages, except open pgp mime. If the message is MIMEMultipart, the text is set in the first text/plain part. >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> email_message = get_basic_email_message() >>> email_message.set_text('New test message text') True >>> text = email_message.get_text() >>> text == 'New test message text' True ''' if self.DEBUGGING: self.log_message("setting text:\n{}".format(text)) text_set = False message = self.get_message() if message.is_multipart(): # set the first text/plain part text_set = False part_index = 0 parts = message.get_payload() while part_index < len(parts) and not text_set: part = message.get_payload(part_index) content_type = part.get_content_type() if content_type == mime_constants.TEXT_PLAIN_TYPE: part.set_payload(text) text_set = True self.log_message('the first text/plain part found') else: self.log_message('body part type is {}'.format(content_type)) part_index += 1 if not text_set: charset, __ = get_charset(self._message, self._last_charset) self.log_message('no text_set char set: {}'.format(charset)) new_part = MIMEText(text, mime_constants.PLAIN_SUB_TYPE, charset) message.attach(new_part) text_set = True self.log_message('added a new text/plain part with text') elif is_open_pgp_mime(message): self.log_message("unable to set text from openpgp mime message") else: self.set_content(text, mime_constants.TEXT_PLAIN_TYPE, charset=charset) text_set = True if self.DEBUGGING: self.log_message("message after setting text:\n" + self.to_string()) self.log_message("set text:\n{}".format(text_set)) return text_set def get_content(self): ''' Get the message's content, decoding if bas64 or print-quoted encoded. 
>>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> email_message = get_basic_email_message() >>> text = email_message.get_content() >>> text == 'Test message text' True ''' decode = False msg = self.get_message() encoding = self.get_header(mime_constants.CONTENT_XFER_ENCODING_KEYWORD) if encoding is not None: encoding = encoding.lower() self.log_message('payloaded encoded with {}'.format(encoding)) # only use the encoding if it's not a multipart message if (encoding == mime_constants.QUOTED_PRINTABLE_ENCODING or encoding == mime_constants.BASE64_ENCODING): current_content_type = self.get_message().get_content_type() if (current_content_type is not None and current_content_type.lower().find(mime_constants.MULTIPART_PRIMARY_TYPE) < 0): decode = True self.log_message('decoding payload with {}'.format(encoding)) try: payload = self._get_decoded_payload(self.get_message(), decode=decode) if self.DEBUGGING: self.log_message('decoded payloaded:\n{}'.format(payload)) self.log_message('type of payload: {}'.format(type(payload))) except: record_exception() payload = message.get_payload() return payload def set_content(self, payload, content_type, charset=None): ''' Set the content of the message. 
>>> from goodcrypto_tests.mail.message_utils import get_basic_email_message >>> email_message = get_basic_email_message() >>> email_message.set_content('New test message text', mime_constants.TEXT_PLAIN_TYPE) >>> text = email_message.get_content() >>> text == 'New test message text' True ''' # create a new message if one doesn't exist if self._message is None: self._message = Message() current_content_type = self.get_message().get_content_type() if current_content_type is None: current_content_type = content_type self.log_message('current content type: {}'.format(current_content_type)) self.log_message('setting content type: {}'.format(content_type)) if self.DEBUGGING: self.log_message('content:\n{}'.format(payload)) current_encoding = self.get_header(mime_constants.CONTENT_XFER_ENCODING_KEYWORD) if current_encoding is None: self._message.__setitem__(mime_constants.CONTENT_XFER_ENCODING_KEYWORD, mime_constants.BITS_8) self.log_message('setting content encoding: {}'.format(mime_constants.BITS_8)) # if this is a simple text or html message, then just update the payload if (content_type == current_content_type and (content_type == mime_constants.TEXT_PLAIN_TYPE or content_type == mime_constants.TEXT_HTML_TYPE)): if charset is None: charset, self._last_charset = get_charset(payload, self._last_charset) self.log_message('getting charset from payload: {}'.format(charset)) elif self._last_charset is None: self._last_charset = constants.DEFAULT_CHAR_SET self.log_message('setting last charset to default: {}'.format()) else: self.log_message('using preset charset: {}'.format(charset)) try: self.get_message().set_payload( self.encode_payload(payload, current_encoding), charset=charset) self.log_message('set payload with {} charset'.format(charset)) if self.DEBUGGING: self.log_message('payload set:\n{}'.format(payload)) except UnicodeEncodeError as error: self.log_message(error.reason) self.log_message('start: {} end: {}'.format(error.start, error.end)) 
self.log_message('object: {}'.format(error.object)) self.get_message().set_payload(self.encode_payload(payload, current_encoding)) self.log_message('setting payload without charset') self.get_message().set_type(content_type) else: from goodcrypto.mail.message.inspect_utils import is_content_type_mime self.log_message('attaching payload for {}'.format(content_type)) if content_type == mime_constants.OCTET_STREAM_TYPE: part = MIMEBase(mime_constants.APPLICATION_TYPE, mime_constants.OCTET_STREAM_SUB_TYPE) part.set_payload(open(payload,"rb").read()) encode_base64(part) part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(payload)) self.get_message().attach(part) elif is_content_type_mime(self.get_message()): if not self.get_message().is_multipart(): if charset is None: charset, self._last_charset = get_charset(payload, self._last_charset) self.log_message('setting content with char set: {}'.format(charset)) else: if self._last_charset is None: self._last_charset = constants.DEFAULT_CHAR_SET self.get_message().set_payload(self.encode_payload(payload, current_encoding), charset) self.log_message('set payload with {} charset'.format(charset)) self.get_message().set_type(content_type) elif content_type == mime_constants.TEXT_PLAIN_TYPE: if self.DEBUGGING: self.log_message('mime text payload:\n{}'.format(payload)) part = MIMEText(payload) if self.DEBUGGING: self.log_message('mime text part:\n{}'.format(part)) part.set_payload(self.encode_payload(payload, current_encoding)) if self.DEBUGGING: self.log_message('mime text part with payload:\n{}'.format(part)) self.get_message().attach(part) else: primary, __, secondary = content_type.partition(mime_constants.PRIMARY_TYPE_DELIMITER) part = MIMEBase(primary, secondary) part.set_payload(self.encode_payload(payload, current_encoding)) self.get_message().attach(part) def encode_payload(self, payload, current_encoding): ''' Encode the payload. Test extreme case. 
>>> email_message = EmailMessage() >>> email_message.encode_payload(None, None) ''' new_payload = payload if payload is not None and current_encoding is not None: """ """ if current_encoding == mime_constants.BASE64_ENCODING: if isinstance(payload, str): payload = payload.encode() new_payload = b64encode(payload) self.log_message('encoding payload with {}'.format(current_encoding)) elif current_encoding == mime_constants.QUOTED_PRINTABLE_ENCODING: if isinstance(payload, str): payload = payload.encode() new_payload = encodestring(payload) self.log_message('encoding payload with {}'.format(current_encoding)) return new_payload def is_probably_pgp(self): ''' Returns true if this is probably an OpenPGP message. >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name >>> with open(get_encrypted_message_name('open-pgp-mime.txt')) as input_file: ... mime_message = EmailMessage(input_file) ... mime_message.is_probably_pgp() True ''' is_pgp = is_open_pgp_mime(self.get_message()) if not is_pgp: content = self.get_content() if is_string(content): is_pgp = self.contains_pgp_message_delimters(content) self.log_message('message uses in line pgp: {}'.format(is_pgp)) elif isinstance(content, list): for part in content: if isinstance(part, Message): part_content = part.get_payload() else: part_content = part if is_string(part_content): is_pgp = self.contains_pgp_message_delimters(part_content) if is_pgp: self.log_message('part of message uses in line pgp: {}'.format(is_pgp)) break else: self.log_message('part of content type is: {}'.format(repr(part_content))) else: self.log_message('content type is: {}'.format(type(content))) return is_pgp def contains_pgp_message_delimters(self, text): ''' Returns true if text contains PGP message delimiters. >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name >>> with open(get_encrypted_message_name('open-pgp-mime.txt')) as input_file: ... text = input_file.read() ... 
email_message = EmailMessage() ... email_message.contains_pgp_message_delimters(text) True ''' return (isinstance(text, str) and text.find(oce_constants.BEGIN_PGP_MESSAGE) >= 0 and text.find(oce_constants.END_PGP_MESSAGE) >= 0) def contains_pgp_signature_delimeters(self, text): ''' Returns true if text contains PGP signature delimiters. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('pgp-signature.txt')) as input_file: ... text = input_file.read() ... email_message = EmailMessage() ... email_message.contains_pgp_signature_delimeters(text) True ''' return (isinstance(text, str) and text.find(oce_constants.BEGIN_PGP_SIGNATURE) >= 0 and text.find(oce_constants.END_PGP_SIGNATURE) >= 0) def get_pgp_signature_blocks(self): ''' Returns the PGP signature blocks with text, if there are any. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('pgp-signature.txt')) as input_file: ... mime_message = EmailMessage(input_file) ... signature_blocks = mime_message.get_pgp_signature_blocks() ... len(signature_blocks) > 0 True ''' def get_signed_data(content): ''' Get the signed data. 
''' signature_block = None start_index = content.find(oce_constants.BEGIN_PGP_SIGNED_MESSAGE) if start_index < 0: start_index = content.find(oce_constants.BEGIN_PGP_SIGNATURE) end_index = content.find(oce_constants.END_PGP_SIGNATURE) if start_index >= 0 and end_index > start_index: signature_block = content[start_index:end_index + len(oce_constants.END_PGP_SIGNATURE)] return signature_block signature_blocks = [] if self.get_message().is_multipart(): self.log_message('check each of {} parts of message for a signature'.format( len(self.get_message().get_payload()))) part_index = 0 parts = self.get_message().get_payload() for part in parts: part_index += 1 if isinstance(part, str): content = part else: content = part.get_payload() if self.contains_pgp_signature_delimeters(content): is_signed = True signature_block = get_signed_data(content) if signature_block is not None: signature_blocks.append(signature_block) self.log_message('found signature block in part {}'.format(part_index)) part_index += 1 else: content = self._get_decoded_payload(self.get_message()) if isinstance(content, str) and self.contains_pgp_signature_delimeters(content): is_signed = True signature_block = get_signed_data(content) if signature_block is not None: signature_blocks.append(signature_block) self.log_message('found signature block in content') self.log_message('total signature blocks: {}'.format(len(signature_blocks))) return signature_blocks def remove_pgp_signature_blocks(self): ''' Remove the PGP signature blocks, if there are any. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('pgp-signature.txt')) as input_file: ... mime_message = EmailMessage(input_file) ... mime_message.remove_pgp_signature_blocks() ... signature_blocks = mime_message.get_pgp_signature_blocks() ... len(signature_blocks) == 0 True ''' def remove_signature(content): ''' Remove the signature from the content. 
''' # remove the beginning signature lines if content.startswith(oce_constants.BEGIN_PGP_SIGNED_MESSAGE): begin_sig_lines = '' for line in content.split('\n'): if len(line.strip()) <= 0: break else: begin_sig_lines += '{}\n'.format(line) content = content[len(begin_sig_lines):] # remove the signature itself start_index = content.find(oce_constants.BEGIN_PGP_SIGNATURE) end_index = content.find(oce_constants.END_PGP_SIGNATURE) content = content[0:start_index] + content[end_index + len(oce_constants.END_PGP_SIGNATURE):] # remove the extra characters added around the message itself content = content.replace('- {}'.format(oce_constants.BEGIN_PGP_MESSAGE), oce_constants.BEGIN_PGP_MESSAGE) content = content.replace('- {}'.format(oce_constants.END_PGP_MESSAGE), oce_constants.END_PGP_MESSAGE) return content try: if self.get_message().is_multipart(): self.log_message('check each of {} parts of message for a signature'.format( len(self.get_message().get_payload()))) part_index = 0 parts = self.get_message().get_payload() for part in parts: part_index += 1 if isinstance(part, str): content = part else: content = self._get_decoded_payload(part) if self.contains_pgp_signature_delimeters(content): charset, __ = get_charset(part) self.log_message('set payload after removing sig with char set: {}'.format(charset)) part.set_payload(remove_signature(content), charset=charset) self.log_message('extracted signature block from part {}'.format(part_index)) else: content = self._get_decoded_payload(self.get_message()) if isinstance(content, str) and self.contains_pgp_signature_delimeters(content): charset, __ = get_charset(part) self.get_message().set_payload(remove_signature(content), charset=charset) self.log_message('extracted signature block from content with char set: {}'.format(charset)) except: self.log_message('EXCEPTION see syr.exception.log') record_exception() def write_to(self, output_file): ''' Write message to the specified file. 
>>> from goodcrypto.mail.utils.dirs import get_test_directory >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name >>> filename = get_encrypted_message_name('iso-8859-1-binary.txt') >>> with open(filename) as input_file: ... output_dir = get_test_directory() ... output_filename = os.path.join(output_dir, 'test-message.txt') ... mime_message = EmailMessage(input_file) ... with open(output_filename, 'w') as out: ... mime_message.write_to(out) ... os.path.exists(output_filename) ... mime_message.write_to(out) ... os.path.exists(output_filename) ... os.remove(output_filename) True True True True if os.path.exists(output_filename): os.remove(output_filename) ''' result_ok = False try: if isinstance(output_file, IOBase): if output_file.closed: with open(output_file.name, 'w') as out: out.write(self.to_string()) out.flush() else: output_file.write(self.to_string()) output_file.flush() elif isinstance(output_file, StringIO): output_file.write(self.to_string()) else: with open(output_file, 'w') as out: out.write(self.to_string()) out.flush() result_ok = True except Exception: record_exception() raise Exception return result_ok def to_string(self, charset=None, mangle_from=False): ''' Convert message to a string. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> filename = get_plain_message_name('basic.txt') >>> with open(filename) as input_file: ... file_content = input_file.read().replace('\\r\\n', '\\n') ... position = input_file.seek(os.SEEK_SET) ... email_message = EmailMessage(input_file) ... 
file_content.strip() == email_message.to_string().strip() True ''' string = None try: msg = self._message if charset is None: charset, __ = get_charset(msg, self._last_charset) self.log_message('char set in to_string(): {}'.format(charset)) # convert the message try: file_pointer = StringIO() message_generator = Generator(file_pointer, mangle_from_=mangle_from, maxheaderlen=78) message_generator.flatten(msg) string = file_pointer.getvalue() except Exception as AttributeError: try: self.log_message('unable to flatten message') record_exception(AttributeError) msg = self._message string = msg.as_string() except Exception as AttributeError: # we explicitly want to catch everything here, even NPE self.log_message('unable to convert message as_string') string = '{}\n\n{}'.format( '\n'.join(self.get_header_lines()), '\n'.join(self.get_content_lines())) if self.DEBUGGING: self.log_message("message string:\n{}".format(string)) except IOError as io_error: self.last_error = io_error self.log_message(io_error) except MessageException as msg_exception: self.last_error = msg_exception self.log_message(msg_exception) return string def get_header_lines(self): ''' Get message headers as a list of lines. The lines follow RFC 2822, with a maximum of 998 characters per line. Longer headers are folded using a leading tab. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> filename = get_plain_message_name('basic.txt') >>> with open(filename) as input_file: ... email_message = EmailMessage(input_file) ... 
len(email_message.get_header_lines()) > 0 True ''' max_line_length = 998 lines = [] keys = self._message.keys() for key in keys: value = self.get_header(key) if value is None: value = '' raw_line = '{}: {}'.format(key, value) if len(raw_line) > max_line_length: # add first line from this header part_line = raw_line[0:max_line_length] lines.append(part_line) raw_line = raw_line[:max_line_length] # add continuation lines while len(raw_line) > max_line_length: # make space for leading tab part_line = raw_line[0:max_line_length - 1] lines.append("\t" + part_line) raw_line = raw_line[max_line_length - 1:] if len(raw_line) > 0: lines.append(raw_line) return lines def get_content_lines(self): ''' Gets the message content as a list of lines. This is the part of the message after the header and the separating blank line, with no decoding. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> filename = get_plain_message_name('basic.txt') >>> with open(filename) as input_file: ... email_message = EmailMessage(input_file) ... len(email_message.get_content_lines()) > 0 True ''' lines = [] payloads = self._message.get_payload() if payloads is None: self.log_message('No content') else: if isinstance(payloads, str): lines = payloads.split('\n') else: for payload in payloads: if isinstance(payload, Message): lines += payload.as_string() else: lines += payload.split('\n') return lines def _parse_header_line(self, line, last_name): ''' Parse a header line (internal user only). >>> email_message = EmailMessage() >>> name, value, last_name = email_message._parse_header_line( ... 
'Mime-Version: 1.0', 'Subject') >>> name == 'Mime-Version' True >>> value == '1.0' True ''' if line is None: name = value = last_name = None else: name, __, value = line.partition(':') if name is not None: name = name.strip() if name is None or len(name) <= 0: self.log_message("no header name in line: " + line) if last_name is not None: old_value = self.get_header(last_name) self.add_header(name, '{} {}\n'.format(old_value.strip('\n'), value.strip())) else: last_name = name if value is None: value = '' else: value = value.strip() try: # try adding the header line and see if python can parse it test_message = Message() test_message.__setitem__(name, value) if isinstance(self.parser, Parser): temp_header = self.parser.parsestr(test_message.as_string(unixfrom=False)) else: temp_header = self.parser.parsebytes(test_message.as_string(unixfrom=False).encode()) if temp_header.__len__() == 0: self.log_message('bad header: {}'.format(line)) self.bad_header_lines.append(line) else: # if the parser accept this header line, then keep it self.add_header(name, value) except Exception: record_exception() self.bad_header_lines.append(line) return name, value, last_name def _set_content_encoding(self, name, value): ''' Set encoding in content (internal use only). >>> email_message = EmailMessage() >>> email_message._set_content_encoding( ... 
mime_constants.CONTENT_TYPE_KEYWORD, 'charset=utf-8') ''' if name is None or value is None: self.log_message('no name or value defined while trying to set content encoding') elif name == mime_constants.CONTENT_TYPE_KEYWORD: try: # try to set the charset index = value.find('charset=') if index >= 0: charset = value[index + len('charset='):] if charset.startswith('"') and charset.endswith('"'): charset = charset[1:len(charset)-1] self._message.set_charset(charset) except Exception: record_exception() self._message.set_charset(constants.DEFAULT_CHAR_SET) elif name == mime_constants.CONTENT_XFER_ENCODING_KEYWORD: encoding_value = self._message.get( mime_constants.CONTENT_XFER_ENCODING_KEYWORD) self.log_message('message encoding: {}'.format(encoding_value)) if encoding_value is None or encoding_value.lower() != value.lower(): self._message.__delitem__(name) self._message.__setitem__(name, value) self.log_message('set message encoding: {}'.format(value)) def _get_decoded_payload(self, msg, decode=True): ''' Get the payload and decode it if necessary. >>> email_message = EmailMessage() >>> email_message._get_decoded_payload(None) ''' if msg is None: payload = None else: payload = msg.get_payload(decode=decode) if isinstance(payload, bytearray) or isinstance(payload, bytes): charset, __ = get_charset(msg, self._last_charset) self.log_message('decoding payload with char set: {}'.format(charset)) try: payload = payload.decode(encoding=charset) except: payload = payload.decode(encoding=charset, errors='replace') return payload def _create_new_header(self, message_string): ''' Create a new header from a corrupted message (internal use only). >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('basic.txt')) as input_file: ... message_string = ''.join(input_file.readlines()) ... email_message = EmailMessage() ... body_text_lines = email_message._create_new_header(message_string) ... 
len(body_text_lines) > 0 True ''' last_name = None body_text_lines = None if message_string is None: self.log_message('no message string defined to create new header') else: self.log_message('starting to parse headers') lines = message_string.split('\n') header_count = 0 for line in lines: if line is None or len(line.strip()) <= 0: self.log_message('finished parsing headers') if header_count + 1 <= len(lines): body_text_lines = lines[header_count + 1:] else: body_text_lines = [] break else: header_count += 1 name, value, last_name = self._parse_header_line(line, last_name) if (name is not None and (name == mime_constants.CONTENT_TYPE_KEYWORD or name == mime_constants.CONTENT_XFER_ENCODING_KEYWORD) ): self._set_content_encoding(name, value) return body_text_lines def _create_new_body_text(self, body): ''' Create the body text from a corrupted message. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('basic.txt')) as input_file: ... email_message = EmailMessage(input_file.readlines()) ... email_message._create_new_body_text('Test new body text') ''' charset, __ = get_charset(self._message, self._last_charset) self.log_message('creating new body text with char set: {}'.format(charset)) try: body_text = '' for line in body: body_text += line.encode(charset) except Exception as body_exception: self.log_message(body_exception) record_exception() body_text = ''.join(body) if len(self.bad_header_lines) > 0: body_text += '\n\n{}\n'.format(i18n('Removed bad header lines')) for bad_header_line in self.bad_header_lines: body_text += ' {}\n'.format(bad_header_line) self._message.set_payload(body_text, charset=charset) def _create_good_message_from_bad(self, source): ''' Create a good message from a source that contains a corrupted message. >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name >>> with open(get_plain_message_name('bad-basic.txt')) as input_file: ... 
email_message = EmailMessage() ... email_message._create_good_message_from_bad(input_file) ''' try: # start with a fresh message self._message = Message() if isinstance(source, IOBase): source.seek(os.SEEK_SET) message_string = source.read() else: message_string = source body_text = self._create_new_header(message_string) if body_text: self._create_new_body_text(body_text) except Exception as message_exception: self.log_message(message_exception) record_exception() raise MessageException(message_exception) def init_new_message(self, from_addr, to_addr, subject, text=None): ''' Initialize a basic new message. Used primarily for testing. >>> # In honor of Kirk Wiebe, a whistleblower about Trailblazer, an NSA mass surveillance project. >>> from_user = '******' >>> to_user = '******' >>> email_message = EmailMessage() >>> email_message.init_new_message(from_user, to_user, "Test message", 'Test body text') ''' self.add_header(mime_constants.FROM_KEYWORD, from_addr) self.add_header(mime_constants.TO_KEYWORD, to_addr) self.add_header(mime_constants.SUBJECT_KEYWORD, subject) if text: self.set_text(text) def log_message_exception(self, exception_error, message, log_msg): ''' Log an exception. >>> from syr.log import BASE_LOG_DIR >>> from syr.user import whoami >>> email_message = EmailMessage() >>> email_message.log_message_exception(Exception, 'message', 'log message') >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'goodcrypto.mail.message.email_message.log')) True >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'syr.exception.log')) True ''' self.log_exception(log_msg, message_exception=exception_error) if message != None: try: self.log_message("message:\n" + message.to_string()) except Exception as exception_error2: self.log_message("unable to log message: {}".format(exception_error2)) def log_exception(self, log_msg, message_exception=None): ''' Log an exception. 
>>> from syr.log import BASE_LOG_DIR >>> from syr.user import whoami >>> email_message = EmailMessage() >>> email_message.log_exception('test') >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'goodcrypto.mail.message.email_message.log')) True >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'syr.exception.log')) True >>> email_message.log_exception('test', message_exception='message exception') ''' record_exception() self.log_message(log_msg) record_exception(message=log_msg) if message_exception is not None: if type(message_exception) == Exception: self.log_message(message_exception.value) record_exception(message=message_exception.value) elif type(message_exception) == str: self.log_message(message_exception) record_exception(message=message_exception) def log_message(self, message): ''' Log a message. >>> from syr.log import BASE_LOG_DIR >>> from syr.user import whoami >>> email_message = EmailMessage() >>> email_message.log_message('test') >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'goodcrypto.mail.message.email_message.log')) True ''' if self._log is None: self._log = LogFile() self._log.write_and_flush(message)