Пример #1
0
    def parse(self, bytesfile):
        """Parse the e-mail read from ``bytesfile`` into a dictionary.

        Returns a dict holding the parsed message object (``msgobj``) and the
        values extracted from it: ``date``, ``subject``, ``body``, ``html``,
        ``from`` (a ``(name, address)`` tuple with the address lower-cased),
        ``to``, ``cc``, ``resent_to``, ``resent_cc`` and ``attachments``.
        """
        message = Parser().parse(bytesfile)
        subject = self.parse_header_field(message["Subject"])

        # The three lists are handed to parse_body, which is expected to
        # append to them; the text fragments are then joined with newlines.
        attachments, text_parts, html_parts = [], [], []
        self.parse_body(message.walk(), attachments, text_parts, html_parts)
        body = u"\n".join(text_parts)
        html = u"\n".join(html_parts)

        # "X-Original-To" addresses are appended to the regular "To" list.
        recipients = self.get_address_list(message.get_all("To", []))
        recipients.extend(self.get_address_list(message.get_all("X-Original-To", [])))
        copies = self.get_address_list(message.get_all("Cc", []))
        resent_recipients = self.get_address_list(message.get_all("resent-to", []))
        resent_copies = self.get_address_list(message.get_all("resent-cc", []))

        sender_name, sender_address = parseaddr(self.get(message.get("From")))
        sender_name = self.parse_header_field(sender_name)
        if sender_address:
            sender_address = sender_address.lower()

        result = {"msgobj": message}
        result["date"] = self.parse_date(self.get(message.get("Date")))
        result["subject"] = subject
        result["body"] = body
        result["html"] = html
        result["from"] = (sender_name, sender_address)
        result["to"] = recipients
        result["cc"] = copies
        result["resent_to"] = resent_recipients
        result["resent_cc"] = resent_copies
        result["attachments"] = attachments
        return result
Пример #2
0
    def parse(self, bytesfile):
        """Parse the e-mail read from ``bytesfile`` into a dictionary.

        Returns a dict holding the parsed message object (``msgobj``), its
        ``message_id`` header and the extracted ``date``, ``subject``,
        ``body``, ``html``, ``from`` (a ``(name, address)`` tuple with the
        address lower-cased), ``to``, ``cc``, ``resent_to``, ``resent_cc``
        and ``attachments``.
        """
        message = Parser().parse(bytesfile)
        subject = self.parse_header_field(message['Subject'])

        # parse_body hands back the text fragments, HTML fragments and
        # attachments; the fragments are joined with newlines.
        text_parts, html_parts, attachments = self.parse_body(message.walk())
        body = '\n'.join(text_parts)
        html = '\n'.join(html_parts)

        # 'X-Original-To' addresses are appended to the regular 'To' list.
        recipients = self.get_address_list(message.get_all('To', []))
        recipients.extend(self.get_address_list(message.get_all('X-Original-To', [])))
        copies = self.get_address_list(message.get_all('Cc', []))
        resent_recipients = self.get_address_list(message.get_all('resent-to', []))
        resent_copies = self.get_address_list(message.get_all('resent-cc', []))

        sender_name, sender_address = parseaddr(self.get(message.get('From')))
        sender_name = self.parse_header_field(sender_name)
        if sender_address:
            sender_address = sender_address.lower()
        date = self.parse_date(self.get(message.get("Date")))

        return dict([
            ('msgobj', message),
            ('message_id', message.get('Message-Id')),
            ('date', date),
            ('subject', subject),
            ('body', body),
            ('html', html),
            ('from', (sender_name, sender_address)),
            ('to', recipients),
            ('cc', copies),
            ('resent_to', resent_recipients),
            ('resent_cc', resent_copies),
            ('attachments', attachments),
        ])
Пример #3
0
 def _get_content(self):
     """Parse the raw bytes in ``self.content`` and return the message object."""
     # self.content is provided by __getattr__ through the cache var self._content
     raw_stream = BytesIO(self.content)
     return BytesParser().parse(raw_stream)
Пример #4
0
def get_metadata_for_wheel(url):
    """Download the wheel at ``url`` and return its parsed METADATA headers.

    The archive is fetched into memory and the first member whose name ends
    with ``.dist-info/METADATA`` is parsed (headers only).

    Returns:
        The parsed message object, or an empty ``EmailMessage`` when the
        archive contains no metadata file.
    """
    data = requests.get(url).content
    with ZipFile(BytesIO(data)) as z:
        for n in z.namelist():
            if n.endswith(".dist-info/METADATA"):
                # Close the archive member once it is parsed instead of
                # leaving the handle open.
                with z.open(n) as metadata_file:
                    return BytesParser().parse(metadata_file, headersonly=True)

    # If we didn't find the metadata, return an empty message object
    return EmailMessage()
Пример #5
0
 def _get_metadata(self):
     """Load the wheel's METADATA headers into ``self._metadata`` once.

     Does nothing when ``self._metadata`` is already set. Otherwise the
     archive from ``self.get_zip()`` is searched for a member whose name
     ends with ``.dist-info/METADATA`` and its headers are parsed.

     Raises:
         RuntimeError: the archive contains no metadata file.
     """
     if self._metadata:
         return
     with self.get_zip() as z:
         for n in z.namelist():
             if n.endswith(".dist-info/METADATA"):
                 # Close the archive member once it is parsed instead of
                 # leaving the handle open.
                 with z.open(n) as metadata_file:
                     self._metadata = BytesParser().parse(metadata_file, headersonly=True)
                 return
     raise RuntimeError("Wheel has no metadata")
Пример #6
0
def get_email_headers(message_bytes, headers=None):
    """Return the decoded header values of a raw e-mail.

    ``headers`` is an iterable of header names to look up; when it is
    ``None`` every header present in the message is returned. The result
    maps each name to the list of its values, each passed through
    ``parse_header_field``. Names that are absent map to an empty list.
    """
    with closing(BytesIO(message_bytes)) as raw:
        message = Parser().parse(raw)

    wanted = dict(message) if headers is None else headers
    decoded = {}
    for name in wanted:
        decoded[name] = [parse_header_field(value) for value in message.get_all(name, [])]
    return decoded
Пример #7
0
def get_email_headers(message_bytes, headers=None):
    p = Parser()
    with closing(BytesIO(message_bytes)) as stream:
        msgobj = p.parse(stream)
    if headers is None:
        headers = dict(msgobj)
    return {
        k: [parse_header_field(x) for x in msgobj.get_all(k, [])]
        for k in headers
    }
Пример #8
0
	def list_mail( self, dir ):
		"""Return the subject and sender of every mail file below ``dir``.

		The directory tree is walked recursively; each file is parsed for
		its headers only. The result is a list of dicts with the keys
		``'subject'`` and ``'from'`` (``None`` when the header is absent).
		"""
		parser = BytesParser()
		mails = []

		for root, _dirs, filenames in os.walk( dir ):
			for filename in filenames:
				# BytesParser needs a binary stream, and the file lives in
				# the directory currently being walked, not always in `dir`.
				with open( os.path.join( root, filename ), 'rb' ) as mail_file:
					email = parser.parse( mail_file, True )
				mails.append( {'subject': email['Subject'], 'from': email['From'] } )

		return mails
Пример #9
0
 def get_metadata(self):
     """Download the wheel at ``self.url`` and store its METADATA headers.

     Nothing is fetched when ``self.filetype`` is not ``"wheel"`` (a note
     is printed instead) or when ``self.metadata`` is already set.
     Otherwise the first archive member whose name ends with
     ``.dist-info/METADATA`` is parsed (headers only) into
     ``self.metadata``; it stays untouched if no such member exists.
     """
     if self.filetype != "wheel":
         print(f"{self.name}: No wheel fo type {self.filetype}")
         return
     if self.metadata:
         return
     data = requests.get(self.url).content
     with ZipFile(BytesIO(data)) as z:
         for n in z.namelist():
             if n.endswith('.dist-info/METADATA'):
                 # Close the archive member once it is parsed instead of
                 # leaving the handle open.
                 with z.open(n) as metadata_file:
                     self.metadata = BytesParser().parse(metadata_file, headersonly=True)
                 break
Пример #10
0
    def parse_attachment(self, message_part):
        """Build an attachment object for ``message_part``.

        A part counts as an attachment when its Content-Disposition type is
        ``attachment``, or ``inline`` with a ``filename`` parameter.

        Returns:
            A ``BytesIO`` holding the payload, with the extra attributes
            ``content_type``, ``size``, ``name``, ``create_date``,
            ``mod_date`` and ``read_date`` set on it, or ``None`` when the
            part is not an attachment.
        """
        content_disposition = message_part.get("Content-Disposition", None)
        if not content_disposition:
            return None
        dispo_type, dispo_dict = self.parse_dispositions(content_disposition)
        if not (dispo_type == "attachment"
                or (dispo_type == 'inline' and 'filename' in dispo_dict)):
            return None

        content_type = message_part.get("Content-Type", None)
        file_data = message_part.get_payload(decode=True)
        if file_data is None:
            # No decodable payload: serialise the sub-parts instead.
            payloads = message_part.get_payload()
            file_data = '\n\n'.join([p.as_string() for p in payloads])
            try:
                file_data = file_data.encode('utf-8')
            except UnicodeEncodeError:
                # BytesIO below only accepts bytes, so never leave a str
                # behind; characters UTF-8 cannot encode are replaced.
                file_data = file_data.encode('utf-8', 'replace')

        attachment = BytesIO(file_data)
        attachment.content_type = message_part.get_content_type()
        attachment.size = len(file_data)
        attachment.name = None
        attachment.create_date = None
        attachment.mod_date = None
        attachment.read_date = None
        if "filename" in dispo_dict:
            attachment.name = dispo_dict['filename']
        if content_type:
            # A "name" parameter on Content-Type overrides the filename.
            _, content_dict = self.parse_dispositions(content_type)
            if 'name' in content_dict:
                attachment.name = content_dict['name']
        if attachment.name is None and content_type == 'message/rfc822':
            # Unnamed embedded message: derive a name from its subject.
            # NOTE(review): Parser is handed a bytes stream here; confirm
            # the imported Parser accepts bytes.
            p = Parser()
            msgobj = p.parse(BytesIO(attachment.getvalue()))
            subject = self.parse_header_field(msgobj['Subject'])
            if subject:
                attachment.name = '%s.eml' % subject[:45]
        if "create-date" in dispo_dict:
            attachment.create_date = dispo_dict[
                'create-date']  # TODO: datetime
        if "modification-date" in dispo_dict:
            attachment.mod_date = dispo_dict[
                'modification-date']  # TODO: datetime
        if "read-date" in dispo_dict:
            attachment.read_date = dispo_dict[
                'read-date']  # TODO: datetime
        return attachment
Пример #11
0
def parse(path: str) -> dict:
    """Parse the ``.eml`` file at ``path`` into a plain dictionary.

    Returns:
        A dict with the keys ``subject``, ``from``, ``to`` (decoded header
        text, ``""`` when the header is missing), ``date`` (raw header
        value or ``""``), ``content``, ``attachments`` (a list of
        ``{"name", "content"}`` dicts produced via ``process_attachment``)
        and ``path``. ``content`` is decoded to ``str`` only when the part
        declares a charset; otherwise it holds the raw payload bytes.
    """
    def _decoded_header(value):
        # A missing header is reported as None, which decode_header()
        # cannot handle; fall back to the empty string instead.
        if value is None:
            return ""
        return str(make_header(decode_header(value)))

    with open(path, 'rb') as eml_file:
        message = BytesParser().parse(eml_file)

    date = message.get("date")
    mail = {
        "subject": _decoded_header(message.get("subject")),
        "from": _decoded_header(message.get("from")),
        "to": _decoded_header(message.get("to")),
        "date": "" if date is None else date,
        "content": "",
        "attachments": [],
        "path": path
    }
    content = ""
    attachments = []
    last_is_plain_text = False
    for part in message.walk():
        if part.is_multipart():
            continue
        file_name = part.get_filename()
        if file_name:
            # Parts carrying a filename are collected as attachments.
            file_name = str(make_header(decode_header(file_name)))
            file_data = part.get_payload(decode=True)
            file_data = process_attachment(file_name, file_data)
            attachments.append({"name": file_name, "content": file_data})
            continue
        if last_is_plain_text:
            # A text/plain body was already picked; keep it.
            continue
        if part.get_content_type() in ['text/plain']:
            last_is_plain_text = True
        content = part.get_payload(decode=True)
        charset = part.get_content_charset()
        if charset:
            content = content.decode(charset)
    mail["content"] = "" if content is None else content
    mail["attachments"] = attachments
    return mail
    def parse_attachment(self, message_part):
        """Return an attachment object for ``message_part``, or ``None``.

        The part is treated as an attachment when its Content-Disposition
        type is ``attachment``, or ``inline`` with a ``filename`` parameter.
        The result is a ``BytesIO`` with the payload plus the attributes
        ``content_type``, ``size``, ``name``, ``create_date``, ``mod_date``
        and ``read_date``.
        """
        content_disposition = message_part.get("Content-Disposition", None)
        if not content_disposition:
            return None
        dispo_type, dispo_dict = self.parse_dispositions(content_disposition)
        if not (dispo_type == "attachment" or (dispo_type == 'inline' and
                'filename' in dispo_dict)):
            return None

        content_type = message_part.get("Content-Type", None)
        file_data = message_part.get_payload(decode=True)
        if file_data is None:
            # No decodable payload: serialise the sub-parts instead.
            payloads = message_part.get_payload()
            file_data = '\n\n'.join([p.as_string() for p in payloads])
            try:
                file_data = file_data.encode('utf-8')
            except UnicodeEncodeError:
                # BytesIO below only accepts bytes, so never leave a str
                # behind; characters UTF-8 cannot encode are replaced.
                file_data = file_data.encode('utf-8', 'replace')

        attachment = BytesIO(file_data)
        attachment.content_type = message_part.get_content_type()
        attachment.size = len(file_data)
        attachment.name = None
        attachment.create_date = None
        attachment.mod_date = None
        attachment.read_date = None
        if "filename" in dispo_dict:
            attachment.name = dispo_dict['filename']
        if content_type:
            # A "name" parameter on Content-Type overrides the filename.
            _, content_dict = self.parse_dispositions(content_type)
            if 'name' in content_dict:
                attachment.name = content_dict['name']
        if attachment.name is None and content_type == 'message/rfc822':
            # Unnamed embedded message: derive a name from its subject.
            # NOTE(review): Parser is handed a bytes stream here; confirm
            # the imported Parser accepts bytes.
            p = Parser()
            msgobj = p.parse(BytesIO(attachment.getvalue()))
            subject = self.parse_header_field(msgobj['Subject'])
            if subject:
                attachment.name = '%s.eml' % subject[:45]
        if "create-date" in dispo_dict:
            attachment.create_date = dispo_dict['create-date']  # TODO: datetime
        if "modification-date" in dispo_dict:
            attachment.mod_date = dispo_dict['modification-date']  # TODO: datetime
        if "read-date" in dispo_dict:
            attachment.read_date = dispo_dict['read-date']  # TODO: datetime
        return attachment
Пример #13
0
    def parse(self, bytesfile):
        """Parse the e-mail read from ``bytesfile`` into a ``ParsedEmail``.

        The plain-text and HTML fragments returned by ``parse_email_body``
        are joined with newlines and stripped. When there is HTML but no
        plain text, the body is produced from the HTML via
        ``convert_html_to_text``.
        """
        message = Parser().parse(bytesfile)

        text_parts, html_parts, attachments = parse_email_body(message)
        plain = '\n'.join(text_parts).strip()
        markup = '\n'.join(html_parts).strip()

        # Fall back to a text rendering of the HTML when no plain body exists.
        if markup and not plain:
            plain = convert_html_to_text(markup)

        fields = parse_main_headers(message)
        extras = {}
        extras['body'] = plain
        extras['html'] = markup
        extras['attachments'] = attachments
        fields.update(extras)

        return ParsedEmail(message, **fields)
Пример #14
0
    def parse(self, bytesfile):
        """Turn the e-mail in ``bytesfile`` into a ``ParsedEmail`` object.

        Body fragments come from ``parse_email_body`` and are newline-joined
        and stripped; header-derived fields come from ``parse_main_headers``.
        If only HTML content is present, ``convert_html_to_text`` supplies
        the plain-text body.
        """
        parser = Parser()
        message = parser.parse(bytesfile)

        fragments, html_fragments, attachments = parse_email_body(message)
        html = '\n'.join(html_fragments).strip()
        body = '\n'.join(fragments).strip()

        if not body:
            if html:
                # No plain text available: derive it from the HTML part.
                body = convert_html_to_text(html)

        email_info = parse_main_headers(message)
        email_info.update({'body': body, 'html': html, 'attachments': attachments})
        return ParsedEmail(message, **email_info)
Пример #15
0
 def parse_attachment(self, message_part):
     """Return an attachment object for ``message_part``, or ``None``.

     The part is an attachment when its Content-Disposition type is
     ``attachment``, or ``inline`` with a ``filename`` parameter. The
     result is a ``BytesIO`` with the payload plus the attributes
     ``content_type``, ``size``, ``name``, ``create_date``, ``mod_date``
     and ``read_date``.
     """
     disposition_header = message_part.get("Content-Disposition", None)
     if not disposition_header:
         return None

     dispo_type, dispo_dict = self.parse_dispositions(disposition_header)
     if dispo_type != "attachment":
         if not (dispo_type == "inline" and "filename" in dispo_dict):
             return None

     content_type = message_part.get("Content-Type", None)
     file_data = message_part.get_payload(decode=True)
     if file_data is None:
         # No decodable payload: serialise the sub-parts instead.
         serialised = [p.as_string() for p in message_part.get_payload()]
         file_data = "\n\n".join(serialised).encode("utf-8")

     attachment = BytesIO(file_data)
     attachment.content_type = message_part.get_content_type()
     attachment.size = len(file_data)
     for attribute in ("name", "create_date", "mod_date", "read_date"):
         setattr(attachment, attribute, None)

     if "filename" in dispo_dict:
         attachment.name = dispo_dict["filename"]
     if content_type:
         # A "name" parameter on Content-Type overrides the filename.
         _, content_dict = self.parse_dispositions(content_type)
         if "name" in content_dict:
             attachment.name = content_dict["name"]
     if attachment.name is None and content_type == "message/rfc822":
         # Unnamed embedded message: derive a name from its subject.
         embedded = Parser().parse(BytesIO(attachment.getvalue()))
         subject = self.parse_header_field(embedded["Subject"])
         if subject:
             attachment.name = "%s.eml" % subject[:45]

     # TODO: datetime - the date parameters are stored as given.
     date_fields = (
         ("create-date", "create_date"),
         ("modification-date", "mod_date"),
         ("read-date", "read_date"),
     )
     for parameter, attribute in date_fields:
         if parameter in dispo_dict:
             setattr(attachment, attribute, dispo_dict[parameter])
     return attachment
Пример #16
0
class ArchivesParser(object):
    def __init__(self):
        self.parser = BytesParser(policy=compat32)

    def parse(self, stream):
        self.rawtxt = stream.read()
        self.msg = self.parser.parse(io.BytesIO(self.rawtxt))

    def is_msgid(self, msgid):
        """Return whether the parsed message's Message-ID equals ``msgid``.

        Always returns a bool: ``True`` on a match, ``False`` when the ids
        differ or when the Message-ID cannot be read or cleaned.
        """
        # Look for a specific messageid. This means we might parse it twice,
        # but so be it. Any exception means we know it's not this one...
        try:
            return self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid
        except Exception:
            return False

    def analyze(self, date_override=None):
        """Extract headers, date, body, attachments and thread parents.

        Populates ``self.msgid``, ``self._from``, ``self.to``, ``self.cc``,
        ``self.subject``, ``self.date``, ``self.bodytxt``,
        ``self.attachments`` and ``self.parents`` from ``self.msg``.

        ``date_override``, when given, is decoded and used instead of the
        Date header. Otherwise a Date header more than four hours in the
        future is replaced by the earliest acceptable date found in the
        Received headers, if there is one.
        """
        self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'), True)
        self.to = self.decode_mime_header(self.get_optional('To'), True)
        self.cc = self.decode_mime_header(self.get_optional('CC'), True)
        self.subject = self.decode_mime_header(self.get_optional('Subject'))
        if date_override:
            self.date = self.forgiving_date_decode(date_override)
        else:
            self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))

            # Accept times up to 4 hours in the future, for badly synced clocks
            maxdate = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=4)
            if self.date > maxdate:
                # Date is in the future, we don't trust that. Instead, let's see if we can find
                # it in the raw text of the message.
                def _extract_date(d):
                    m = _re_received.match(d)
                    if m:
                        try:
                            return self.forgiving_date_decode(m.group(1).strip())
                        except IgnorableException:
                            pass
                    return None

                # get_all() yields None when there is no Received header and
                # min() raises on an empty sequence; default both so that we
                # really fall back to the date we already have.
                received = self.msg.get_all('Received', [])
                lowdate = min((x for x in map(_extract_date, received) if x and x < maxdate), default=None)
                if lowdate:
                    self.date = lowdate
                # Else we're going to go with what we found
        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if len(self.attachments) > 0:
            log.status("Found %s attachments" % len(self.attachments))

        # Build an list of the message id's we are interested in
        self.parents = []
        # The first one is in-reply-to, if it exists
        if self.get_optional('in-reply-to'):
            m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
            if m:
                self.parents.append(m)

        # Then we add all References values, in backwards order
        if self.get_optional('references'):
            cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we can't
            # use a set() to make it unique, because order is very important
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    def clean_charset(self, charset):
        """Map a broken or non-standard charset name to a usable one.

        The lookup ignores case. A name without a known replacement is
        returned exactly as it was passed in.
        """
        replacements = {
            # Special case where we don't know... We'll assume
            # us-ascii and use replacements
            'unknown-8bit': 'us-ascii',
            'x-unknown': 'us-ascii',
            'unknown': 'us-ascii',
            # Seriously broken charset definitions, map to us-ascii
            # and throw away the rest with replacements
            '0': 'us-ascii',
            'x-user-defined': 'us-ascii',
            '_autodetect_all': 'us-ascii',
            'default_charset': 'us-ascii',
            # Some MUAs set it to x-gbk, but there is a valid
            # declaration as gbk...
            'x-gbk': 'gbk',
            # -I is a special logical version, but should be the
            # same charset
            'iso-8859-8-i': 'iso-8859-8',
            # This is an alias for iso-8859-11
            'windows-874': 'iso-8859-11',
            # Strange way of saying 8859....
            'iso-88-59-1': 'iso-8859-1',
            'iso-8858-1': 'iso-8859-1',
            'iso885915': 'iso-8859-15',
            'iso-latin-2': 'iso-8859-2',
            # Strange spelling of cp850 (windows charset)
            'iso-850': 'cp850',
            'koi8r': 'koi8-r',
            'cp 1252': 'cp1252',
            # Why did this show up more than once?!
            'iso-8859-1,iso-8859-2': 'iso-8859-1',
            'iso-8859-1:utf8:us-ascii': 'iso-8859-1',
            'x-windows-949': 'ms949',
            # This is a locale, and not a charset, but most likely it's this one
            'pt_pt': 'iso-8859-1',
            'de_latin': 'iso-8859-1',
            'de': 'iso-8859-1',
            # How is this a *common* mistake?
            'iso-8858-15': 'iso-8859-15',
            'macintosh': 'mac_roman',
            'cn-big5': 'big5',
            'x-unicode-2-0-utf-7': 'utf-7',
            # No support for this charset :S Map it down to ascii
            # and throw away all the rest. sucks, but we have to
            'tscii': 'us-ascii',
        }
        return replacements.get(charset.lower(), charset)

    def get_payload_as_unicode(self, msg):
        """Return the decoded payload of ``msg`` as text.

        Returns:
            ``None`` when ``get_payload(decode=True)`` yields ``None`` or
            trips the AssertionError handled below, ``''`` for an empty
            payload, and the decoded ``str`` otherwise. Bytes that are
            invalid in the chosen charset are dropped.

        Raises:
            IgnorableException: the declared charset is unknown to Python.
        """
        try:
            b = msg.get_payload(decode=True)
        except AssertionError:
            # Badly encoded data can throw an exception here, where the python
            # libraries fail to handle it and enters a cannot-happen path.
            # In which case we just ignore it and hope for a better MIME part later.
            b = None

        if b is None:
            return None
        if not b:
            # An empty payload comes back as b''; hand out '' so that the
            # callers' comparisons against '' recognise it as "empty".
            return ''

        params = msg.get_params()
        if not params:
            # No content-type, so we assume us-ascii
            return str(b, 'us-ascii', errors='ignore')

        # Find out if there is a charset
        charset = None
        for k, v in params:
            if k.lower() == 'charset':
                charset = v
                break
        if not charset:
            # XXX: reasonable default?
            return str(b, errors='ignore')
        try:
            return str(b, self.clean_charset(charset), errors='ignore')
        except LookupError as e:
            raise IgnorableException("Failed to get unicode payload: %s" % e)

    # Regular expression matching the PostgreSQL custom mail footer that
    # is appended to all emails. Written as a raw string so the regex
    # escapes (\s, \(, \.) are not treated as invalid string escapes.
    _re_footer = re.compile(r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)

    def get_body(self):
        b = self._get_body()
        if b:
            # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
            # later reject..
            if b.find('\udbff\n\udef8'):
                b = b.replace('\udbff\n\udef8', '')

        # Remove postgres specific mail footer - if it's there
        m = self._re_footer.match(b)
        if m:
            b = m.group(1)

        # Sometimes we end up with a trailing \0 when decoding long strings, so
        # replace it if it's there.
        # In fact, replace it everywhere, since it can also turn up in the middle
        # of a text when it's a really broken decoding.
        b = b.replace('\0', '')

        return b

    def _get_body(self):
        # This is where the magic happens - try to figure out what the body
        # of this message should render as.
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #      text/plain
        #      text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '' or b is None:
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" % self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None
        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
                return self.get_payload_as_unicode(p)
            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b

        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None

    def get_attachments(self):
        self.attachments_found_first_plaintext = False
        self.recursive_get_attachments(self.msg)

    # Clean a filenames encoding and return it as a unicode string
    def _clean_filename_encoding(self, filename):
        # If this is a header-encoded filename, start by decoding that
        if filename.startswith('=?'):
            decoded, encoding = decode_header(filename)[0]
            return str(decoded, encoding, errors='ignore')

        # If it's already unicode, just return it
        if isinstance(filename, str):
            return filename

        # Anything that's not UTF8, we just get rid of. We can live with
        # filenames slightly mangled in this case.
        return str(filename, 'utf-8', errors='ignore')

    def _extract_filename(self, container):
        # Try to get the filename for an attachment in the container.
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)

        # Failing that, some mailers set Content-Description to the
        # filename
        if 'Content-Description' in container:
            return self._clean_filename_encoding(container['Content-Description'])
        return None

    def recursive_get_attachments(self, container):
        # We start recursion in the "multipart" container if any
        if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
        elif container.get_content_type() == 'multipart/alternative':
            # Alternative is not an attachment (we decide)
            # It's typilcally plantext + html
            self.attachments_found_first_plaintext = True
            return
        elif container.is_multipart():
            # Other kinds of multipart, such as multipart/signed...
            return
        else:
            # Not a multipart.
            # Exclude specific contenttypes
            if container.get_content_type() == 'application/pgp-signature':
                return
            if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
                return
            # For now, accept anything not text/plain
            if container.get_content_type() != 'text/plain':
                try:
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return
                return

            # It's a text/plain, it might be worthwhile.
            # If it has a name, we consider it an attachments
            if not container.get_params():
                return
            for k, v in container.get_params():
                if k == 'name' and v != '':
                    # Yes, it has a name
                    try:
                        self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                    except AssertionError:
                        # Badly encoded data can throw an exception here, where the python
                        # libraries fail to handle it and enters a cannot-happen path.
                        # In which case we just ignore this attachment.
                        return

                    return

            # If it's content-disposition=attachment, we also want to save it
            if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
                try:
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                return

            # If we have already found one text/plain part, make all
            # further text/plain parts attachments
            if self.attachments_found_first_plaintext:
                # However, this will also *always* catch the MIME part added
                # by majordomo with the footer. So if that one is present,
                # we need to explicitly exclude it again.
                try:
                    b = container.get_payload(decode=True)
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                if isinstance(b, str) and not self._re_footer.match(b):
                    # We know there is no name for this one
                    self.attachments.append((None, container.get_content_type(), b))
                return

            # Ok, so this was a plaintext that we ignored. Set the flag
            # that we have now ignored one, so we'll make the next one
            # an attachment.
            self.attachments_found_first_plaintext = True
            # No name, and text/plain, so ignore it

    # Captures whatever sits between the angle brackets of a message-id
    # value, tolerating whitespace around the brackets. Written as a raw
    # string so that "\s" reaches the regex engine instead of being an
    # invalid string escape.
    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        """Return the bare id from a "<...>" message-id value.

        The text between the angle brackets is returned with every space
        removed.

        Args:
            messageid: header value expected to look like "<id>".
            ignorebroken: when true, an unparsable value is logged and
                None is returned instead of raising.

        Raises:
            IgnorableException: the value has no "<...>" part and
                ignorebroken is false.
        """
        found = self.re_msgid.match(messageid)
        if found:
            return found.group(1).replace(' ', '')
        if not ignorebroken:
            raise IgnorableException("Could not parse message id '%s'" % messageid)
        log.status("Could not parse messageid '%s', ignoring it" % messageid)
        return None

    # An earlier version of _date_multi_re used \w for the words inside the
    # parentheses; [^\s] is used instead so that Japanese characters match.
    _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$')
    _date_multiminus_re = re.compile(' -(-\d+)$')
    _date_offsetnoplus_re = re.compile(' (\d{4})$')

    def forgiving_date_decode(self, d):
        """Parse a date string, tolerating many known malformations.

        Known-broken timezone spellings are rewritten first, then the
        value is handed to dateutil's fuzzy parser.

        Args:
            d: the date string to parse.

        Returns:
            A timezone-aware datetime. Values parsed without a timezone,
            and values whose UTC offset is 16 hours or more, come back
            with tzinfo set to UTC.

        Raises:
            IgnorableException: d is blank or cannot be parsed.
        """
        if d.strip() == '':
            raise IgnorableException("Failed to parse empty date")

        # Strange timezones requiring manual adjustments. The order is the
        # historical order of the checks. As before, a matching suffix
        # triggers replacement of every occurrence in the string.
        suffix_fixups = (
            ('-7700 (EST)', 'EST'),
            ('+6700 (EST)', 'EST'),
            ('+-4-30', '+0430'),
            ('+1.00', '+0100'),
            ('+-100', '+0100'),
            ('+500', '+0500'),
            ('-500', '-0500'),
            ('-700', '-0700'),
            ('-800', '-0800'),
            ('+05-30', '+0530'),
            ('+0-900', '-0900'),
            ('Mexico/General', 'CDT'),
            ('Pacific Daylight Time', 'PDT'),
            (' ZE2', ' +0200'),
        )
        for broken, fixed in suffix_fixups:
            if d.endswith(broken):
                d = d.replace(broken, fixed)

        # Fragments fixed wherever they appear. str.replace() is a no-op
        # when the fragment is absent, so no guard is needed; the previous
        # str.find() based guards mishandled a match at index 0.
        for broken, fixed in (('-Juin-', '-Jun-'),
                              ('-Juil-', '-Jul-'),
                              (' 0 (GMT)', ' +0000')):
            d = d.replace(broken, fixed)

        # " --0500" style double minus -> " -0500"
        d = self._date_multiminus_re.sub(' \\1', d)

        # Trailing four digits without a sign get a "+" in place of the
        # preceding space.
        d = self._date_offsetnoplus_re.sub('+\\1', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # "<full datespace> +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)

            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            offset = dp.utcoffset()
            if offset and abs(offset.days * (24 * 60 * 60) + offset.seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            if not dp.tzinfo:
                dp = dp.replace(tzinfo=datetime.timezone.utc)
            return dp
        except Exception as e:
            raise IgnorableException("Failed to parse date '%s': %s" % (d, e)) from e

    def _maybe_decode(self, s, charset):
        if isinstance(s, str):
            return s.strip(' ')
        return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')

    # Workaround for broken quoting in some MUAs (see below)
    _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)

    def _decode_mime_header(self, hdr, email_workaround):
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
        # we must get rid of the sequence \n\t at least in the header. If we
        # do this *before* doing any MIME decoding, we should be safe against
        # anybody *actually* putting that sequence in the header (since we
        # won't match the encoded contents)
        hdr = hdr.replace("\n\t", " ")

        # In at least some cases, at least gmail (and possibly other MUAs)
        # incorrectly put double quotes in the name/email field even when
        # it's encoded. That's not allowed - they have to be escaped - but
        # since there's a fair amount of those, we apply a regex to get
        # rid of them.
        m = self._re_mailworkaround.search(hdr)
        if m:
            hdr = self._re_mailworkaround.sub(r'\1', hdr)

        try:
            return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
        except HeaderParseError as e:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors
            return str(hdr, 'us-ascii', errors='ignore').strip(' ')

    def decode_mime_header(self, hdr, email_workaround=False):
        """Decode a header value and strip NUL characters from it.

        Header objects are encoded to their string form first. An empty or
        missing result is returned as ''.

        Raises:
            IgnorableException: decoding failed with a LookupError or a
                ValueError.
        """
        try:
            if isinstance(hdr, Header):
                hdr = hdr.encode()
            decoded = self._decode_mime_header(hdr, email_workaround)
        except (LookupError, ValueError) as err:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, err))
        return decoded.replace("\0", "") if decoded else ''

    def get_mandatory(self, fieldname):
        """Return header fieldname from self.msg, which must be present.

        Raises:
            IgnorableException: the lookup fails or yields None.
        """
        try:
            value = self.msg[fieldname]
        except Exception:
            # Only ordinary errors count as "missing"; KeyboardInterrupt and
            # SystemExit are no longer swallowed as a bare except would do.
            value = None
        if value is None:
            raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
        return value

    def get_optional(self, fieldname):
        """Return header fieldname from self.msg, or '' if the lookup fails.

        Whatever the lookup itself returns is passed through unchanged.
        """
        try:
            return self.msg[fieldname]
        except Exception:
            # Deliberately best-effort, but limited to ordinary errors so that
            # KeyboardInterrupt and SystemExit still propagate.
            return ''

    def html_clean(self, html):
        """Reduce an HTML body to text.

        The markup is first normalised with tidy and then fed through
        HTMLCleaner. Returns the extracted text, or None when tidy reports
        errors or the cleaner fails.
        """
        tidy_options = {
            'drop-proprietary-attributes': 1,
            'alt-text': '',
            'hide-comments': 1,
            'output-xhtml': 1,
            'show-body-only': 1,
            'clean': 1,
            'char-encoding': 'utf8',
            'show-warnings': 0,
            'show-info': 0,
        }
        html, errors = tidylib.tidy_document(html, options=tidy_options)
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            extractor = HTMLCleaner()
            extractor.feed(html)
            return extractor.get_text()
        except Exception:
            # The HTML could not be parsed, so it cannot be cleaned either.
            return None
Пример #17
0
class ArchivesParser(object):
    def __init__(self):
        """Create the bytes parser that parse() uses for every message."""
        # The parser is built with the compat32 policy.
        self.parser = BytesParser(policy=compat32)

    def parse(self, stream):
        self.rawtxt = stream.read()
        self.msg = self.parser.parse(io.BytesIO(self.rawtxt))

    def is_msgid(self, msgid):
        # Look for a specific messageid. This means we might parse it twice,
        # but so be it. Any exception means we know it's not this one...
        try:
            if self.clean_messageid(
                    self.decode_mime_header(
                        self.get_mandatory('Message-ID'))) == msgid:
                return True
        except Exception:
            return False

    def analyze(self, date_override=None):
        """Extract headers, date, body, attachments and parents.

        Populates self.msgid, self._from, self.to, self.cc, self.subject,
        self.date, self.bodytxt, self.attachments and self.parents from
        self.msg.

        Args:
            date_override: when given, parsed and used as the message date
                instead of the Date header.

        Raises:
            IgnorableException: propagated from the helpers, e.g. when a
                mandatory header is missing or the date is unparsable.
        """
        self.msgid = self.clean_messageid(
            self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'), True)
        self.to = self.decode_mime_header(self.get_optional('To'), True)
        self.cc = self.decode_mime_header(self.get_optional('CC'), True)
        self.subject = self.decode_mime_header(self.get_optional('Subject'))
        if date_override:
            self.date = self.forgiving_date_decode(date_override)
        else:
            self.date = self.forgiving_date_decode(
                self.decode_mime_header(self.get_mandatory('Date')))

            # Accept times up to 4 hours in the future, for badly synced clocks
            maxdate = datetime.datetime.now(
                datetime.timezone.utc) + datetime.timedelta(hours=4)
            if self.date > maxdate:
                # Date is in the future, we don't trust that. Instead, let's see if we can find
                # it in the raw text of the message.
                def _extract_date(d):
                    # _re_received is referenced as a bare name defined
                    # outside this method.
                    m = _re_received.match(d)
                    if m:
                        try:
                            return self.forgiving_date_decode(
                                m.group(1).strip())
                        except IgnorableException:
                            pass
                    return None

                # get_all() yields None without a default when there are no
                # Received headers, and min() raises on an empty iterable;
                # both cases must fall through to the Date header value.
                received = self.msg.get_all('Received', [])
                lowdate = min(
                    (x for x in map(_extract_date, received)
                     if x and x < maxdate),
                    default=None)
                if lowdate:
                    self.date = lowdate
                # Else we're going to go with what we found
        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if len(self.attachments) > 0:
            log.status("Found %s attachments" % len(self.attachments))

        # Build an list of the message id's we are interested in
        self.parents = []
        # The first one is in-reply-to, if it exists
        in_reply_to = self.get_optional('in-reply-to')
        if in_reply_to:
            m = self.clean_messageid(
                self.decode_mime_header(in_reply_to), True)
            if m:
                self.parents.append(m)

        # Then we add all References values, in backwards order
        references = self.get_optional('references')
        if references:
            cleaned_msgids = [
                self.clean_messageid(x, True)
                for x in reversed(self.decode_mime_header(references).split())
            ]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we can't
            # use a set() to make it unique, because order is very important
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    def clean_charset(self, charset):
        lcharset = charset.lower()
        if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
            # Special case where we don't know... We'll assume
            # us-ascii and use replacements
            return 'us-ascii'
        if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
            # Seriously broken charset definitions, map to us-ascii
            # and throw away the rest with replacements
            return 'us-ascii'
        if lcharset == 'x-gbk':
            # Some MUAs set it to x-gbk, but there is a valid
            # declaratoin as gbk...
            return 'gbk'
        if lcharset == 'iso-8859-8-i':
            # -I is a special logical version, but should be the
            # same charset
            return 'iso-8859-8'
        if lcharset == 'windows-874':
            # This is an alias for iso-8859-11
            return 'iso-8859-11'
        if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
            # Strange way of saying 8859....
            return 'iso-8859-1'
        if lcharset == 'iso885915':
            return 'iso-8859-15'
        if lcharset == 'iso-latin-2':
            return 'iso-8859-2'
        if lcharset == 'iso-850':
            # Strange spelling of cp850 (windows charset)
            return 'cp850'
        if lcharset == 'koi8r':
            return 'koi8-r'
        if lcharset == 'cp 1252':
            return 'cp1252'
        if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
            # Why did this show up more than once?!
            return 'iso-8859-1'
        if lcharset == 'x-windows-949':
            return 'ms949'
        if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
            # This is a locale, and not a charset, but most likely it's this one
            return 'iso-8859-1'
        if lcharset == 'iso-8858-15':
            # How is this a *common* mistake?
            return 'iso-8859-15'
        if lcharset == 'macintosh':
            return 'mac_roman'
        if lcharset == 'cn-big5':
            return 'big5'
        if lcharset == 'x-unicode-2-0-utf-7':
            return 'utf-7'
        if lcharset == 'tscii':
            # No support for this charset :S Map it down to ascii
            # and throw away all the rest. sucks, but we have to
            return 'us-ascii'
        return charset

    def get_payload_as_unicode(self, msg):
        try:
            b = msg.get_payload(decode=True)
        except AssertionError:
            # Badly encoded data can throw an exception here, where the python
            # libraries fail to handle it and enters a cannot-happen path.
            # In which case we just ignore it and hope for a better MIME part later.
            b = None

        if b:
            # Find out if there is a charset
            charset = None
            params = msg.get_params()
            if not params:
                # No content-type, so we assume us-ascii
                return str(b, 'us-ascii', errors='ignore')
            for k, v in params:
                if k.lower() == 'charset':
                    charset = v
                    break
            if charset:
                try:
                    return str(b, self.clean_charset(charset), errors='ignore')
                except LookupError as e:
                    raise IgnorableException(
                        "Failed to get unicode payload: %s" % e)
            else:
                # XXX: reasonable default?
                return str(b, errors='ignore')
        # Return None or empty string, depending on what we got back
        return b

    # Regular expression matching the PostgreSQL custom mail footer that
    # is appended to all emails. Group 1 captures everything in front of
    # the footer; re.DOTALL lets that group span several lines.
    _re_footer = re.compile(
        r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$',
        re.DOTALL)

    def get_body(self):
        b = self._get_body()
        if b:
            # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
            # later reject..
            if b.find('\udbff\n\udef8'):
                b = b.replace('\udbff\n\udef8', '')

        # Remove postgres specific mail footer - if it's there
        m = self._re_footer.match(b)
        if m:
            b = m.group(1)

        # Sometimes we end up with a trailing \0 when decoding long strings, so
        # replace it if it's there.
        # In fact, replace it everywhere, since it can also turn up in the middle
        # of a text when it's a really broken decoding.
        b = b.replace('\0', '')

        return b

    def _get_body(self):
        # This is where the magic happens - try to figure out what the body
        # of this message should render as.
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #      text/plain
        #      text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '' or b is None:
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" %
                                 self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None
        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status(
                    "Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain."
                    % self.msgid)
                return self.get_payload_as_unicode(p)
            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p[
                        'Content-Disposition'].startswith('attachment'):
                    continue
                t = self.get_payload_as_unicode(p)
                if t:
                    return t
            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p[
                        'Content-Disposition'].startswith('attachment'):
                    continue
                t = self.get_payload_as_unicode(p)
                if t:
                    return t
            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b

        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None

    def get_attachments(self):
        """Collect the attachments of self.msg.

        Results are appended to self.attachments by
        recursive_get_attachments(); this method does not create that list.
        """
        # Start every scan with the "a text/plain part was already seen"
        # flag cleared.
        self.attachments_found_first_plaintext = False
        self.recursive_get_attachments(self.msg)

    # Clean a filenames encoding and return it as a unicode string
    def _clean_filename_encoding(self, filename):
        # If this is a header-encoded filename, start by decoding that
        if filename.startswith('=?'):
            decoded, encoding = decode_header(filename)[0]
            return str(decoded, encoding, errors='ignore')

        # If it's already unicode, just return it
        if isinstance(filename, str):
            return filename

        # Anything that's not UTF8, we just get rid of. We can live with
        # filenames slightly mangled in this case.
        return str(filename, 'utf-8', errors='ignore')

    def _extract_filename(self, container):
        # Try to get the filename for an attachment in the container.
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)

        # Failing that, some mailers set Content-Description to the
        # filename
        if 'Content-Description' in container:
            return self._clean_filename_encoding(
                container['Content-Description'])
        return None

    def recursive_get_attachments(self, container):
        # We start recursion in the "multipart" container if any
        if container.get_content_type(
        ) == 'multipart/mixed' or container.get_content_type(
        ) == 'multipart/signed':
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
        elif container.get_content_type() == 'multipart/alternative':
            # Alternative is not an attachment (we decide)
            # It's typilcally plantext + html
            self.attachments_found_first_plaintext = True
            return
        elif container.is_multipart():
            # Other kinds of multipart, such as multipart/signed...
            return
        else:
            # Not a multipart.
            # Exclude specific contenttypes
            if container.get_content_type() == 'application/pgp-signature':
                return
            if container.get_content_type() in (
                    'application/pkcs7-signature',
                    'application/x-pkcs7-signature'):
                return
            # For now, accept anything not text/plain
            if container.get_content_type() != 'text/plain':
                try:
                    self.attachments.append(
                        (self._extract_filename(container),
                         container.get_content_type(),
                         container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return
                return

            # It's a text/plain, it might be worthwhile.
            # If it has a name, we consider it an attachments
            if not container.get_params():
                return
            for k, v in container.get_params():
                if k == 'name' and v != '':
                    # Yes, it has a name
                    try:
                        self.attachments.append(
                            (self._extract_filename(container),
                             container.get_content_type(),
                             container.get_payload(decode=True)))
                    except AssertionError:
                        # Badly encoded data can throw an exception here, where the python
                        # libraries fail to handle it and enters a cannot-happen path.
                        # In which case we just ignore this attachment.
                        return

                    return

            # If it's content-disposition=attachment, we also want to save it
            if 'Content-Disposition' in container and container[
                    'Content-Disposition'].startswith('attachment'):
                try:
                    self.attachments.append(
                        (self._extract_filename(container),
                         container.get_content_type(),
                         container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                return

            # If we have already found one text/plain part, make all
            # further text/plain parts attachments
            if self.attachments_found_first_plaintext:
                # However, this will also *always* catch the MIME part added
                # by majordomo with the footer. So if that one is present,
                # we need to explicitly exclude it again.
                try:
                    b = container.get_payload(decode=True)
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                if isinstance(b, str) and not self._re_footer.match(b):
                    # We know there is no name for this one
                    self.attachments.append(
                        (None, container.get_content_type(), b))
                return

            # Ok, so this was a plaintext that we ignored. Set the flag
            # that we have now ignored one, so we'll make the next one
            # an attachment.
            self.attachments_found_first_plaintext = True
            # No name, and text/plain, so ignore it

    # Captures whatever sits between the angle brackets of a message-id
    # value, tolerating whitespace around the brackets.
    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        m = self.re_msgid.match(messageid)
        if not m:
            if ignorebroken:
                log.status("Could not parse messageid '%s', ignoring it" %
                           messageid)
                return None
            raise IgnorableException("Could not parse message id '%s'" %
                                     messageid)
        return m.groups(1)[0].replace(' ', '')


    # An earlier version of _date_multi_re used \w for the words inside the
    # parentheses; [^\s] is used instead so that Japanese characters match.

    # Trailing parenthesised comment that is empty or holds two or more
    # whitespace-separated words, e.g. " (MET DST)" or " ()".
    _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    # Numeric offset followed by a non-empty parenthesised comment, e.g.
    # " +0200 (CEST)"; group 1 is the offset.
    _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
    # Doubled minus sign in front of a trailing offset, e.g. " --0500".
    _date_multiminus_re = re.compile(r' -(-\d+)$')
    # Trailing four digits without a sign, e.g. " 0200".
    _date_offsetnoplus_re = re.compile(r' (\d{4})$')

    def forgiving_date_decode(self, d):
        """Parse a date string, tolerating many known malformations.

        Known-broken timezone spellings are rewritten first, then the
        value is handed to dateutil's fuzzy parser.

        Args:
            d: the date string to parse.

        Returns:
            A timezone-aware datetime. Values parsed without a timezone,
            and values whose UTC offset is 16 hours or more, come back
            with tzinfo set to UTC.

        Raises:
            IgnorableException: d is blank or cannot be parsed.
        """
        if d.strip() == '':
            raise IgnorableException("Failed to parse empty date")

        # Strange timezones requiring manual adjustments. The order is the
        # historical order of the checks. As before, a matching suffix
        # triggers replacement of every occurrence in the string.
        suffix_fixups = (
            ('-7700 (EST)', 'EST'),
            ('+6700 (EST)', 'EST'),
            ('+-4-30', '+0430'),
            ('+1.00', '+0100'),
            ('+-100', '+0100'),
            ('+500', '+0500'),
            ('-500', '-0500'),
            ('-700', '-0700'),
            ('-800', '-0800'),
            ('+05-30', '+0530'),
            ('+0-900', '-0900'),
            ('Mexico/General', 'CDT'),
            ('Pacific Daylight Time', 'PDT'),
            (' ZE2', ' +0200'),
        )
        for broken, fixed in suffix_fixups:
            if d.endswith(broken):
                d = d.replace(broken, fixed)

        # Fragments fixed wherever they appear. str.replace() is a no-op
        # when the fragment is absent, so no guard is needed; the previous
        # str.find() based guards mishandled a match at index 0.
        for broken, fixed in (('-Juin-', '-Jun-'),
                              ('-Juil-', '-Jul-'),
                              (' 0 (GMT)', ' +0000')):
            d = d.replace(broken, fixed)

        # " --0500" style double minus -> " -0500"
        d = self._date_multiminus_re.sub(' \\1', d)

        # Trailing four digits without a sign get a "+" in place of the
        # preceding space.
        d = self._date_offsetnoplus_re.sub('+\\1', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # "<full datespace> +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)

            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            offset = dp.utcoffset()
            if offset and abs(offset.days * (24 * 60 * 60) +
                              offset.seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            if not dp.tzinfo:
                dp = dp.replace(tzinfo=datetime.timezone.utc)
            return dp
        except Exception as e:
            raise IgnorableException(
                "Failed to parse date '%s': %s" % (d, e)) from e

    def _maybe_decode(self, s, charset):
        if isinstance(s, str):
            return s.strip(' ')
        return str(s,
                   charset and self.clean_charset(charset) or 'us-ascii',
                   errors='ignore').strip(' ')

    # Workaround for broken quoting in some MUAs (see _decode_mime_header):
    # matches a whole "=?charset?Q-or-B?data?=" word wrapped in double quotes
    # and captures it without the quotes.
    _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"',
                                    re.IGNORECASE)

    def _decode_mime_header(self, hdr, email_workaround):
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
        # we must get rid of the sequence \n\t at least in the header. If we
        # do this *before* doing any MIME decoding, we should be safe against
        # anybody *actually* putting that sequence in the header (since we
        # won't match the encoded contents)
        hdr = hdr.replace("\n\t", " ")

        # In at least some cases, at least gmail (and possibly other MUAs)
        # incorrectly put double quotes in the name/email field even when
        # it's encoded. That's not allowed - they have to be escaped - but
        # since there's a fair amount of those, we apply a regex to get
        # rid of them.
        m = self._re_mailworkaround.search(hdr)
        if m:
            hdr = self._re_mailworkaround.sub(r'\1', hdr)

        try:
            return " ".join([
                self._maybe_decode(s, charset)
                for s, charset in decode_header(hdr)
            ])
        except HeaderParseError:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors
            return str(hdr, 'us-ascii', errors='ignore').strip(' ')

    def decode_mime_header(self, hdr, email_workaround=False):
        try:
            if isinstance(hdr, Header):
                hdr = hdr.encode()

            h = self._decode_mime_header(hdr, email_workaround)
            if h:
                return h.replace("\0", "")
            return ''
        except LookupError as e:
            raise IgnorableException("Failed to decode header value '%s': %s" %
                                     (hdr, e))
        except ValueError as ve:
            raise IgnorableException("Failed to decode header value '%s': %s" %
                                     (hdr, ve))

    def get_mandatory(self, fieldname):
        try:
            x = self.msg[fieldname]
            if x is None:
                raise Exception()
            return x
        except Exception:
            raise IgnorableException("Mandatory field '%s' is missing" %
                                     fieldname)

    def get_optional(self, fieldname):
        try:
            return self.msg[fieldname]
        except Exception:
            return ''

    def html_clean(self, html):
        """Reduce an HTML body to text.

        The markup is first normalised with tidy and then fed through
        HTMLCleaner. Returns the extracted text, or None when tidy reports
        errors or the cleaner fails.
        """
        tidy_options = {
            'drop-proprietary-attributes': 1,
            'alt-text': '',
            'hide-comments': 1,
            'output-xhtml': 1,
            'show-body-only': 1,
            'clean': 1,
            'char-encoding': 'utf8',
            'show-warnings': 0,
            'show-info': 0,
        }
        html, errors = tidylib.tidy_document(html, options=tidy_options)
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            extractor = HTMLCleaner()
            extractor.feed(html)
            return extractor.get_text()
        except Exception:
            # The HTML could not be parsed, so it cannot be cleaned either.
            return None
Пример #18
0
class EmailMessage(object):
    '''
        Email Message.

        Messages should be converted to EmailMessage as soon as possible,
        to check whether the message is parsable as part of validating input.

        If a MIME message is not parsable, a new Message will be created that does conform
        and contains the original unparsable message in the body.
    '''

    DEBUGGING = False

    def __init__(self, message_or_file=None):
        '''
             Creates an EmailMessage from a Message, another EmailMessage,
             a file-like object, or message text.

             * None: an empty Message is created.
             * Message: the object is wrapped as-is.
             * EmailMessage: its underlying message is reused (not copied).
             * IOBase / StringIO: parsed with Parser.parse(); on TypeError the
               file is rewound and parsed again with BytesParser.
             * anything else: parsed with Parser.parsestr(); on TypeError it is
               parsed with BytesParser.parsebytes().

             After parsing, the result is checked with validate_message(); if that
             fails, or parsing raised, _create_good_message_from_bad() is called with
             the original input. Exceptions raised in this recovery path are passed
             to record_exception() instead of being propagated.

             >>> email_message = EmailMessage()
             >>> type(email_message)
             <class 'goodcrypto.mail.message.email_message.EmailMessage'>
        '''

        self.bad_header_lines = []
        # starts as a text parser; swapped for a BytesParser below if the input needs it
        self.parser = Parser()

        self._last_charset = constants.DEFAULT_CHAR_SET
        self._log = self._message = None

        if message_or_file is None:
            self._message = Message()

        elif isinstance(message_or_file, Message):
            self._message = message_or_file

        elif isinstance(message_or_file, EmailMessage):
            # shares the other wrapper's message object
            self._message = message_or_file.get_message()

        else:
            try:
                if isinstance(message_or_file, IOBase)  or isinstance(message_or_file, StringIO):
                    self.log_message('about to parse a message from a file')
                    try:
                        self._message = self.parser.parse(message_or_file)
                        self.log_message('parsed message from file')
                    except TypeError:
                        # the text parser rejected the input: rewind and retry as bytes
                        message_or_file.seek(0, os.SEEK_SET)
                        self.parser = BytesParser()
                        self._message = self.parser.parse(message_or_file)
                        self.log_message('parsed message from file as bytes')
                else:
                    try:
                        self.log_message('about to parse a message from a string')
                        self._message = self.parser.parsestr(message_or_file)
                        self.log_message('parsed message from string')
                    except TypeError:
                        # not a str: retry with the bytes parser
                        self.parser = BytesParser()
                        self._message = self.parser.parsebytes(message_or_file)
                        self.log_message('parsed message from bytes')

                if not self.validate_message():
                    self._create_good_message_from_bad(message_or_file)
            except Exception:
                try:
                    self.log_message('EXCEPTION - see syr.exception.log for details')
                    record_exception()

                    self._create_good_message_from_bad(message_or_file)

                    # if we still don't have a good message, then blow up
                    if not self.validate_message():
                        self.log_message('unable to create a valid message')
                        raise MessageException()
                except Exception:
                    # NOTE(review): this also catches the MessageException raised just
                    # above, so it is recorded here rather than reaching the caller.
                    record_exception()

        if self.DEBUGGING:
            try:
                self.log_message(self.to_string())
            except:
                # debug logging is best-effort only
                pass


    def get_header(self, key):
        '''
            Look up a header on the wrapped message.

            Returns the value stored under key, or None if the lookup raises.

            >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name
            >>> with open(get_encrypted_message_name('basic.txt')) as input_file:
            ...     email_message = EmailMessage(input_file)
            ...     crypto_software = email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER)
            >>> crypto_software == 'GPG'
            True
        '''

        try:
            return self.get_message()[key]
        except Exception:
            return None


    def add_header(self, key, value):
        '''
            Store value under the header key on the wrapped message.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('basic.txt')) as input_file:
            ...     email_message = EmailMessage(input_file)
            ...     email_message.add_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER, 'GPG')
            ...     crypto_software = email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER)
            >>> crypto_software == 'GPG'
            True
        '''

        self._message[key] = value


    def change_header(self, key, value):
        '''
            Replace the value of a header, adding the header if it is absent.

            >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name
            >>> with open(get_encrypted_message_name('bouncy-castle.txt')) as input_file:
            ...     email_message = EmailMessage(input_file)
            ...     email_message.change_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER, 'TestGPG')
            ...     crypto_software = email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER)
            >>> crypto_software == 'TestGPG'
            True
        '''

        if key not in self._message:
            # nothing to replace yet
            self.add_header(key, value)
            return

        self._message.replace_header(key, value)


    def delete_header(self, key):
        '''
            Remove a header from the wrapped message.

            >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name
            >>> with open(get_encrypted_message_name('bouncy-castle.txt')) as input_file:
            ...     email_message = EmailMessage(input_file)
            ...     email_message.delete_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER)
            ...     email_message.get_header(constants.ACCEPTED_CRYPTO_SOFTWARE_HEADER) is None
            True
        '''

        del self._message[key]


    def get_message(self):
        '''
            Return the wrapped message object.

            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> from goodcrypto.oce.test_constants import EDWARD_LOCAL_USER
            >>> email_message = get_basic_email_message()
            >>> email_message.get_message() is not None
            True
            >>> email_message.get_message().get(mime_constants.FROM_KEYWORD) == EDWARD_LOCAL_USER
            True
        '''

        wrapped = self._message
        return wrapped


    def set_message(self, new_message):
        '''
            Replace the wrapped message.

            A string is parsed with the current parser (encoded to bytes first when
            the parser is not a Parser); any other value is stored as given. If parsing
            raises, or the result fails validate_message(), the previous message is kept.

            # Get a basic message first so we can avoid recursion
            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> from goodcrypto.oce.test_constants import EDWARD_LOCAL_USER
            >>> basic_email_message = get_basic_email_message().get_message()
            >>> email_message = EmailMessage()
            >>> email_message.get_message().get(mime_constants.FROM_KEYWORD) is None
            True
            >>> email_message.set_message(basic_email_message)
            >>> email_message.get_message().get(mime_constants.FROM_KEYWORD) == EDWARD_LOCAL_USER
            True
        '''

        previous = self._message

        if not is_string(new_message):
            self._message = new_message
        else:
            try:
                if isinstance(self.parser, Parser):
                    parsed = self.parser.parsestr(new_message)
                else:
                    parsed = self.parser.parsebytes(new_message.encode())
                self._message = parsed
            except:
                self._message = previous
                record_exception()

        # keep the candidate only if it validates; otherwise roll back
        if self.validate_message():
            return

        self._message = previous
        self.log_message('restored previous message')

    def validate_message(self):
        '''
            Validate a message.

            Python's parser frequently accepts a message that has garbage in the header by
            simply adding all header items after the bad header line(s) to the body text;
            this can leave a pretty unmanageable message so we apply our own validation.

            Returns True when the Validator accepts this message. Returns False when
            it rejects the message (the reason is logged) or when validation itself
            raises (the exception is recorded).

            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> from goodcrypto.oce.test_constants import EDWARD_LOCAL_USER
            >>> email_message = get_basic_email_message()
            >>> email_message.validate_message()
            True
        '''
        valid = False
        try:
            validator = Validator(self)
            if validator.is_message_valid():
                valid = True
                self.log_message('message is valid')
            else:
                self.log_message('message is invalid')
                self.log_message(validator.get_why())
        except Exception:
            # The exception object is not needed here; binding it to the name
            # "AttributeError" (as before) only shadowed the builtin.
            valid = False
            record_exception()

        return valid

    def get_text(self):
        '''
            Return the text of the current message.

            For an open pgp mime message nothing is extracted and None is returned.
            For a multipart message the decoded payload of the first text/plain part
            is returned (None if there is no such part). Otherwise the decoded payload
            of the message itself is returned.

            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> email_message = get_basic_email_message()
            >>> text = email_message.get_text()
            >>> text == 'Test message text'
            True
        '''

        message = self.get_message()

        if is_open_pgp_mime(message):
            self.log_message("unable to get text from openpgp mime message")
            return None

        if not message.is_multipart():
            text = self._get_decoded_payload(message)
            self.log_message("payload is a: {}".format(type(text)))
            return text

        self.log_message("message is a MIMEMultipart")

        # scan the parts in order and stop at the first text/plain one
        for part in message.get_payload():
            content_type = part.get_content_type()
            if content_type == mime_constants.TEXT_PLAIN_TYPE:
                return self._get_decoded_payload(part)
            self.log_message("body part type is " + content_type)

        return None


    def set_text(self, text, charset=None):
        '''
            Store text in the current message.

            For a multipart message the text replaces the payload of the first
            text/plain part; if there is none, a new text/plain part is attached.
            For a non-multipart open pgp mime message nothing is changed.
            Otherwise the text becomes the message content via set_content().

            Returns True if the text was stored, False otherwise.

            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> email_message = get_basic_email_message()
            >>> email_message.set_text('New test message text')
            True
            >>> text = email_message.get_text()
            >>> text == 'New test message text'
            True
        '''

        if self.DEBUGGING: self.log_message("setting text:\n{}".format(text))

        text_set = False
        message = self.get_message()
        if message.is_multipart():
            # walk the parts in order looking for the first text/plain one
            for part in message.get_payload():
                content_type = part.get_content_type()
                if content_type == mime_constants.TEXT_PLAIN_TYPE:
                    part.set_payload(text)
                    text_set = True
                    self.log_message('the first text/plain part found')
                    break
                self.log_message('body part type is {}'.format(content_type))
            else:
                # no text/plain part exists, so append one
                charset, __ = get_charset(self._message, self._last_charset)
                self.log_message('no text_set char set: {}'.format(charset))
                new_part = MIMEText(text, mime_constants.PLAIN_SUB_TYPE, charset)
                message.attach(new_part)
                text_set = True
                self.log_message('added a new text/plain part with text')

        elif is_open_pgp_mime(message):
            self.log_message("unable to set text from openpgp mime message")

        else:
            self.set_content(text, mime_constants.TEXT_PLAIN_TYPE, charset=charset)
            text_set = True

        if self.DEBUGGING:
            self.log_message("message after setting text:\n" + self.to_string())
            self.log_message("set text:\n{}".format(text_set))

        return text_set


    def get_content(self):
        '''
            Get the message's content, decoding it if base64 or quoted-printable encoded.

            The payload is decoded only when the Content-Transfer-Encoding header names
            one of those two encodings and the message's content type does not contain
            the multipart primary type. If fetching the decoded payload raises, the
            exception is recorded and the message's raw payload is returned instead.

            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> email_message = get_basic_email_message()
            >>> text = email_message.get_content()
            >>> text == 'Test message text'
            True
        '''

        message = self.get_message()
        decode = False
        encoding = self.get_header(mime_constants.CONTENT_XFER_ENCODING_KEYWORD)
        if encoding is not None:
            encoding = encoding.lower()
            self.log_message('payloaded encoded with {}'.format(encoding))

            # only use the encoding if it's not a multipart message
            if encoding in (mime_constants.QUOTED_PRINTABLE_ENCODING,
                            mime_constants.BASE64_ENCODING):
                current_content_type = message.get_content_type()
                if (current_content_type is not None and
                    current_content_type.lower().find(mime_constants.MULTIPART_PRIMARY_TYPE) < 0):
                    decode = True
                    self.log_message('decoding payload with {}'.format(encoding))

        try:
            payload = self._get_decoded_payload(message, decode=decode)
            if self.DEBUGGING: self.log_message('decoded payloaded:\n{}'.format(payload))
            self.log_message('type of payload: {}'.format(type(payload)))
        except Exception:
            record_exception()
            # Fall back to the undecoded payload. The original referenced an
            # undefined name here, so this recovery path itself raised NameError.
            payload = message.get_payload()

        return payload

    def set_content(self, payload, content_type, charset=None):
        '''
            Set the content of the message.

            If content_type matches the message's current content type and is the
            plain-text or html type, the payload replaces the existing payload.
            Otherwise:
              * for the octet-stream type, payload is used as a filename; the file is
                read, base64 encoded and attached as a part;
              * else, if is_content_type_mime() accepts the message, the payload is set
                directly (non-multipart message) or attached as a new part (multipart).

            A Content-Transfer-Encoding header is added when the message has none.
            When charset is None it is derived from the payload with get_charset().

            >>> from goodcrypto_tests.mail.message_utils import get_basic_email_message
            >>> email_message = get_basic_email_message()
            >>> email_message.set_content('New test message text', mime_constants.TEXT_PLAIN_TYPE)
            >>> text = email_message.get_content()
            >>> text == 'New test message text'
            True
        '''

        # create a new message if one doesn't exist
        if self._message is None:
            self._message = Message()

        current_content_type = self.get_message().get_content_type()
        if current_content_type is None:
            current_content_type = content_type
        self.log_message('current content type: {}'.format(current_content_type))
        self.log_message('setting content type: {}'.format(content_type))
        if self.DEBUGGING: self.log_message('content:\n{}'.format(payload))

        current_encoding = self.get_header(mime_constants.CONTENT_XFER_ENCODING_KEYWORD)
        if current_encoding is None:
            self._message.__setitem__(mime_constants.CONTENT_XFER_ENCODING_KEYWORD, mime_constants.BITS_8)
            self.log_message('setting content encoding: {}'.format(mime_constants.BITS_8))

        # if this is a simple text or html message, then just update the payload
        if (content_type == current_content_type and
            content_type in (mime_constants.TEXT_PLAIN_TYPE, mime_constants.TEXT_HTML_TYPE)):

            if charset is None:
                charset, self._last_charset = get_charset(payload, self._last_charset)
                self.log_message('getting charset from payload: {}'.format(charset))
            elif self._last_charset is None:
                self._last_charset = constants.DEFAULT_CHAR_SET
                # The original called .format() with no argument here, which raises
                # IndexError for the '{}' placeholder.
                self.log_message('setting last charset to default: {}'.format(self._last_charset))
            else:
                self.log_message('using preset charset: {}'.format(charset))

            try:
                self.get_message().set_payload(
                   self.encode_payload(payload, current_encoding), charset=charset)
                self.log_message('set payload with {} charset'.format(charset))
                if self.DEBUGGING: self.log_message('payload set:\n{}'.format(payload))
            except UnicodeEncodeError as error:
                # the payload cannot be encoded with this charset; store it without one
                self.log_message(error.reason)
                self.log_message('start: {} end: {}'.format(error.start, error.end))
                self.log_message('object: {}'.format(error.object))
                self.get_message().set_payload(self.encode_payload(payload, current_encoding))
                self.log_message('setting payload without charset')
            self.get_message().set_type(content_type)

        else:
            from goodcrypto.mail.message.inspect_utils import is_content_type_mime

            self.log_message('attaching payload for {}'.format(content_type))
            if content_type == mime_constants.OCTET_STREAM_TYPE:
                part = MIMEBase(mime_constants.APPLICATION_TYPE, mime_constants.OCTET_STREAM_SUB_TYPE)
                # payload is a path here; close the file deterministically
                with open(payload, "rb") as attachment:
                    part.set_payload(attachment.read())
                encode_base64(part)
                part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(payload))
                self.get_message().attach(part)

            elif is_content_type_mime(self.get_message()):
                if not self.get_message().is_multipart():
                    if charset is None:
                        charset, self._last_charset = get_charset(payload, self._last_charset)
                        self.log_message('setting content with char set: {}'.format(charset))
                    else:
                        if self._last_charset is None:
                            self._last_charset = constants.DEFAULT_CHAR_SET
                    self.get_message().set_payload(self.encode_payload(payload, current_encoding), charset)
                    self.log_message('set payload with {} charset'.format(charset))
                    self.get_message().set_type(content_type)

                elif content_type == mime_constants.TEXT_PLAIN_TYPE:
                    if self.DEBUGGING: self.log_message('mime text payload:\n{}'.format(payload))
                    part = MIMEText(payload)
                    if self.DEBUGGING: self.log_message('mime text part:\n{}'.format(part))
                    part.set_payload(self.encode_payload(payload, current_encoding))
                    if self.DEBUGGING: self.log_message('mime text part with payload:\n{}'.format(part))
                    self.get_message().attach(part)

                else:
                    primary, __, secondary = content_type.partition(mime_constants.PRIMARY_TYPE_DELIMITER)
                    part = MIMEBase(primary, secondary)
                    part.set_payload(self.encode_payload(payload, current_encoding))
                    self.get_message().attach(part)

    def encode_payload(self, payload, current_encoding):
        '''
            Encode the payload to match a transfer encoding.

            The payload is returned unchanged when it or current_encoding is None,
            or when current_encoding is neither the base64 nor the quoted-printable
            constant. A str payload is encoded to bytes before being passed to
            the encoder.

            Test extreme case.
            >>> email_message = EmailMessage()
            >>> email_message.encode_payload(None, None)
        '''
        if payload is None or current_encoding is None:
            return payload

        if current_encoding == mime_constants.BASE64_ENCODING:
            encoder = b64encode
        elif current_encoding == mime_constants.QUOTED_PRINTABLE_ENCODING:
            encoder = encodestring
        else:
            return payload

        raw = payload.encode() if isinstance(payload, str) else payload
        encoded = encoder(raw)
        self.log_message('encoding payload with {}'.format(current_encoding))
        return encoded

    def is_probably_pgp(self):
        '''
            Report whether this looks like an OpenPGP message.

            True when the message is open pgp mime, or when its content (or one of
            the parts of list content) contains the PGP message delimiters.

            >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name
            >>> with open(get_encrypted_message_name('open-pgp-mime.txt')) as input_file:
            ...     mime_message = EmailMessage(input_file)
            ...     mime_message.is_probably_pgp()
            True
        '''

        is_pgp = is_open_pgp_mime(self.get_message())
        if is_pgp:
            return is_pgp

        content = self.get_content()
        if is_string(content):
            is_pgp = self.contains_pgp_message_delimters(content)
            self.log_message('message uses in line pgp: {}'.format(is_pgp))
        elif isinstance(content, list):
            for part in content:
                part_content = part.get_payload() if isinstance(part, Message) else part

                if not is_string(part_content):
                    self.log_message('part of content type is: {}'.format(repr(part_content)))
                    continue

                is_pgp = self.contains_pgp_message_delimters(part_content)
                if is_pgp:
                    self.log_message('part of message uses in line pgp: {}'.format(is_pgp))
                    break
        else:
            self.log_message('content type is: {}'.format(type(content)))

        return is_pgp

    def contains_pgp_message_delimters(self, text):
        '''
            Report whether text holds both the begin and end PGP message delimiters.

            Anything that is not a str yields False.

            >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name
            >>> with open(get_encrypted_message_name('open-pgp-mime.txt')) as input_file:
            ...     text = input_file.read()
            ...     email_message = EmailMessage()
            ...     email_message.contains_pgp_message_delimters(text)
            True
        '''

        if not isinstance(text, str):
            return False

        has_begin = oce_constants.BEGIN_PGP_MESSAGE in text
        return has_begin and oce_constants.END_PGP_MESSAGE in text

    def contains_pgp_signature_delimeters(self, text):
        '''
            Report whether text holds both the begin and end PGP signature delimiters.

            Anything that is not a str yields False.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('pgp-signature.txt')) as input_file:
            ...     text = input_file.read()
            ...     email_message = EmailMessage()
            ...     email_message.contains_pgp_signature_delimeters(text)
            True
        '''

        if not isinstance(text, str):
            return False

        has_begin = oce_constants.BEGIN_PGP_SIGNATURE in text
        return has_begin and oce_constants.END_PGP_SIGNATURE in text

    def get_pgp_signature_blocks(self):
        '''
            Returns the PGP signature blocks with text, if there are any.

            For a multipart message each part's payload is inspected; otherwise the
            decoded payload of the message is inspected. A block runs from the
            "begin signed message" delimiter (or, failing that, the "begin signature"
            delimiter) through the "end signature" delimiter.

            Returns a list of strings; empty when no signature block is found.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('pgp-signature.txt')) as input_file:
            ...     mime_message = EmailMessage(input_file)
            ...     signature_blocks = mime_message.get_pgp_signature_blocks()
            ...     len(signature_blocks) > 0
            True
        '''

        def get_signed_data(content):
            ''' Get the signed data, or None if the delimiters are not in order. '''

            signature_block = None
            start_index = content.find(oce_constants.BEGIN_PGP_SIGNED_MESSAGE)
            if start_index < 0:
                start_index = content.find(oce_constants.BEGIN_PGP_SIGNATURE)
            end_index = content.find(oce_constants.END_PGP_SIGNATURE)
            if start_index >= 0 and end_index > start_index:
                signature_block = content[start_index:end_index + len(oce_constants.END_PGP_SIGNATURE)]

            return signature_block

        signature_blocks = []
        message = self.get_message()
        if message.is_multipart():
            parts = message.get_payload()
            self.log_message('check each of {} parts of message for a signature'.format(
                len(parts)))
            # Number parts from 1 for logging. The original bumped the counter
            # twice per iteration, so the logged part numbers were wrong.
            for part_index, part in enumerate(parts, start=1):
                if isinstance(part, str):
                    content = part
                else:
                    content = part.get_payload()
                if self.contains_pgp_signature_delimeters(content):
                    signature_block = get_signed_data(content)
                    if signature_block is not None:
                        signature_blocks.append(signature_block)
                    self.log_message('found signature block in part {}'.format(part_index))

        else:
            content = self._get_decoded_payload(message)
            if isinstance(content, str) and self.contains_pgp_signature_delimeters(content):
                signature_block = get_signed_data(content)
                if signature_block is not None:
                    signature_blocks.append(signature_block)
                    self.log_message('found signature block in content')

        self.log_message('total signature blocks: {}'.format(len(signature_blocks)))

        return signature_blocks

    def remove_pgp_signature_blocks(self):
        '''
            Remove the PGP signature blocks, if there are any.

            For a multipart message every part whose decoded payload contains the
            signature delimiters has its payload rewritten without the signature;
            otherwise the message's own payload is rewritten. Any exception raised
            while doing so is logged and recorded, not propagated.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('pgp-signature.txt')) as input_file:
            ...     mime_message = EmailMessage(input_file)
            ...     mime_message.remove_pgp_signature_blocks()
            ...     signature_blocks = mime_message.get_pgp_signature_blocks()
            ...     len(signature_blocks) == 0
            True
        '''

        def remove_signature(content):
            ''' Remove the signature from the content. '''

            # remove the beginning signature lines (everything up to the first blank line)
            if content.startswith(oce_constants.BEGIN_PGP_SIGNED_MESSAGE):
                begin_sig_lines = ''
                for line in content.split('\n'):
                    if len(line.strip()) <= 0:
                        break
                    else:
                        begin_sig_lines += '{}\n'.format(line)
                content = content[len(begin_sig_lines):]

            # remove the signature itself
            start_index = content.find(oce_constants.BEGIN_PGP_SIGNATURE)
            end_index = content.find(oce_constants.END_PGP_SIGNATURE)
            content = content[0:start_index] + content[end_index + len(oce_constants.END_PGP_SIGNATURE):]

            # remove the extra characters added around the message itself
            content = content.replace('- {}'.format(oce_constants.BEGIN_PGP_MESSAGE), oce_constants.BEGIN_PGP_MESSAGE)
            content = content.replace('- {}'.format(oce_constants.END_PGP_MESSAGE), oce_constants.END_PGP_MESSAGE)

            return content

        try:
            message = self.get_message()
            if message.is_multipart():
                parts = message.get_payload()
                self.log_message('check each of {} parts of message for a signature'.format(
                    len(parts)))
                for part_index, part in enumerate(parts, start=1):
                    if isinstance(part, str):
                        content = part
                    else:
                        content = self._get_decoded_payload(part)
                    if self.contains_pgp_signature_delimeters(content):
                        charset, __ = get_charset(part)
                        self.log_message('set payload after removing sig with char set: {}'.format(charset))
                        part.set_payload(remove_signature(content), charset=charset)
                        self.log_message('extracted signature block from part {}'.format(part_index))

            else:
                content = self._get_decoded_payload(message)
                if isinstance(content, str) and self.contains_pgp_signature_delimeters(content):
                    # The original passed an undefined "part" here; the resulting
                    # NameError was swallowed below, so single-part messages kept
                    # their signature. Use the message itself.
                    charset, __ = get_charset(message)
                    message.set_payload(remove_signature(content), charset=charset)
                    self.log_message('extracted signature block from content with char set: {}'.format(charset))
        except Exception:
            self.log_message('EXCEPTION see syr.exception.log')
            record_exception()

    def write_to(self, output_file):
        '''
            Write the message, as produced by to_string(), to output_file.

            output_file may be an open IOBase object (written and flushed), a closed
            IOBase object (reopened for writing by its name), a StringIO, or a
            filename. Returns True on success; on failure the exception is recorded
            and a bare Exception is raised.

            >>> from goodcrypto.mail.utils.dirs import get_test_directory
            >>> from goodcrypto_tests.mail.message_utils import get_encrypted_message_name
            >>> filename = get_encrypted_message_name('iso-8859-1-binary.txt')
            >>> with open(filename) as input_file:
            ...     output_dir = get_test_directory()
            ...     output_filename = os.path.join(output_dir, 'test-message.txt')
            ...     mime_message = EmailMessage(input_file)
            ...     with open(output_filename, 'w') as out:
            ...         mime_message.write_to(out)
            ...         os.path.exists(output_filename)
            ...         mime_message.write_to(out)
            ...     os.path.exists(output_filename)
            ...     os.remove(output_filename)
            True
            True
            True
            True

            if os.path.exists(output_filename):
                os.remove(output_filename)
        '''

        def emit(stream, flush=True):
            ''' Write the serialized message to stream, optionally flushing. '''
            stream.write(self.to_string())
            if flush:
                stream.flush()

        try:
            if isinstance(output_file, IOBase):
                if output_file.closed:
                    with open(output_file.name, 'w') as reopened:
                        emit(reopened)
                else:
                    emit(output_file)

            elif isinstance(output_file, StringIO):
                emit(output_file, flush=False)

            else:
                with open(output_file, 'w') as created:
                    emit(created)
        except Exception:
            record_exception()
            raise Exception

        return True


    def to_string(self, charset=None, mangle_from=False):
        '''
            Convert message to a string.

            The message is flattened with a Generator (maxheaderlen=78, mangle_from_
            taken from mangle_from). If that raises, Message.as_string() is tried, and
            if that also raises the string is assembled from get_header_lines() and
            get_content_lines().

            When charset is None it is looked up with get_charset(); in this method
            the value is only logged.

            Returns the string, or None when an IOError or MessageException escapes
            the conversion (the error is stored in self.last_error and logged).

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> filename = get_plain_message_name('basic.txt')
            >>> with open(filename) as input_file:
            ...     file_content = input_file.read().replace('\\r\\n', '\\n')
            ...     position = input_file.seek(os.SEEK_SET)
            ...     email_message = EmailMessage(input_file)
            ...     file_content.strip() == email_message.to_string().strip()
            True
        '''

        text = None

        try:
            msg = self._message
            if charset is None:
                charset, __ = get_charset(msg, self._last_charset)
                self.log_message('char set in to_string(): {}'.format(charset))

            #  convert the message
            try:
                file_pointer = StringIO()
                message_generator = Generator(file_pointer, mangle_from_=mangle_from, maxheaderlen=78)
                message_generator.flatten(msg)
                text = file_pointer.getvalue()
            except Exception as flatten_error:
                # The exception used to be bound to the name "AttributeError",
                # which shadowed the builtin; behavior is otherwise unchanged.
                try:
                    self.log_message('unable to flatten message')
                    record_exception(flatten_error)

                    text = msg.as_string()
                except Exception:
                    #  we explicitly want to catch everything here
                    self.log_message('unable to convert message as_string')

                    text = '{}\n\n{}'.format(
                        '\n'.join(self.get_header_lines()),
                        '\n'.join(self.get_content_lines()))

                    if self.DEBUGGING: self.log_message("message string:\n{}".format(text))

        except IOError as io_error:
            self.last_error = io_error
            self.log_message(io_error)

        except MessageException as msg_exception:
            self.last_error = msg_exception
            self.log_message(msg_exception)

        return text


    def get_header_lines(self):
        '''
            Get message headers as a list of lines.

            Each header is rendered as 'name: value'; a header whose value is
            None is rendered with an empty value. The lines follow RFC 2822,
            with a maximum of 998 characters per line. Longer headers are
            folded: the first line holds the first 998 characters and every
            following line starts with a tab.

            Returns:
                A list of strings, one per physical header line.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> filename = get_plain_message_name('basic.txt')
            >>> with open(filename) as input_file:
            ...     email_message = EmailMessage(input_file)
            ...     len(email_message.get_header_lines()) > 0
            True
        '''

        max_line_length = 998
        #  continuation lines give up one character to the leading tab
        max_continuation_length = max_line_length - 1

        lines = []
        for key in self._message.keys():
            value = self.get_header(key)
            if value is None:
                value = ''
            raw_line = '{}: {}'.format(key, value)

            #  the first line of a header may use the full width
            lines.append(raw_line[:max_line_length])

            #  whatever did not fit goes on tab-prefixed continuation lines;
            #  the split is by length only, not at whitespace
            remainder = raw_line[max_line_length:]
            for start in range(0, len(remainder), max_continuation_length):
                lines.append('\t' + remainder[start:start + max_continuation_length])

        return lines


    def get_content_lines(self):
        '''
            Gets the message content as a list of lines.

            This is the part of the message after the header and the separating blank
            line, with no decoding.

            A string payload is split on newlines. For a list of payloads,
            each Message part is rendered with as_string() and every part is
            split on newlines, in order.

            Returns:
                A list of strings; empty when the message has no payload.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> filename = get_plain_message_name('basic.txt')
            >>> with open(filename) as input_file:
            ...     email_message = EmailMessage(input_file)
            ...     len(email_message.get_content_lines()) > 0
            True
        '''

        lines = []
        payloads = self._message.get_payload()
        if payloads is None:
            self.log_message('No content')
        elif isinstance(payloads, str):
            lines = payloads.split('\n')
        else:
            for payload in payloads:
                if isinstance(payload, Message):
                    part_text = payload.as_string()
                else:
                    part_text = payload
                #  extend with whole lines; adding a string to a list with +=
                #  would append it one character at a time
                lines.extend(part_text.split('\n'))

        return lines

    def _parse_header_line(self, line, last_name):
        '''
            Parse a header line (internal user only).

            >>> email_message = EmailMessage()
            >>> name, value, last_name = email_message._parse_header_line(
            ...   'Mime-Version: 1.0', 'Subject')
            >>> name == 'Mime-Version'
            True
            >>> value == '1.0'
            True
        '''

        if line is None:
            name = value = last_name = None
        else:
            name, __, value = line.partition(':')
            if name is not None:
                name = name.strip()

            if name is None or len(name) <= 0:
                self.log_message("no header name in line: " + line)
                if last_name is not None:
                    old_value = self.get_header(last_name)
                    self.add_header(name, '{} {}\n'.format(old_value.strip('\n'), value.strip()))
            else:
                last_name = name
                if value is None:
                    value = ''
                else:
                    value = value.strip()

            try:
                # try adding the header line and see if python can parse it
                test_message = Message()
                test_message.__setitem__(name, value)
                if isinstance(self.parser, Parser):
                    temp_header = self.parser.parsestr(test_message.as_string(unixfrom=False))
                else:
                    temp_header = self.parser.parsebytes(test_message.as_string(unixfrom=False).encode())
                if temp_header.__len__() == 0:
                    self.log_message('bad header: {}'.format(line))
                    self.bad_header_lines.append(line)
                else:
                    # if the parser accept this header line, then keep it
                    self.add_header(name, value)
            except Exception:
                record_exception()
                self.bad_header_lines.append(line)

        return name, value, last_name

    def _set_content_encoding(self, name, value):
        '''
            Set encoding in content (internal use only).

            >>> email_message = EmailMessage()
            >>> email_message._set_content_encoding(
            ...   mime_constants.CONTENT_TYPE_KEYWORD, 'charset=utf-8')
        '''

        if name is None or value is None:
            self.log_message('no name or value defined while trying to set content encoding')

        elif name == mime_constants.CONTENT_TYPE_KEYWORD:
            try:
                # try to set the charset
                index = value.find('charset=')
                if index >= 0:
                    charset = value[index + len('charset='):]
                    if charset.startswith('"') and charset.endswith('"'):
                        charset = charset[1:len(charset)-1]
                    self._message.set_charset(charset)
            except Exception:
                record_exception()
                self._message.set_charset(constants.DEFAULT_CHAR_SET)

        elif name == mime_constants.CONTENT_XFER_ENCODING_KEYWORD:
            encoding_value = self._message.get(
               mime_constants.CONTENT_XFER_ENCODING_KEYWORD)
            self.log_message('message encoding: {}'.format(encoding_value))
            if encoding_value is None or encoding_value.lower() != value.lower():
                self._message.__delitem__(name)
                self._message.__setitem__(name, value)
                self.log_message('set message encoding: {}'.format(value))

    def _get_decoded_payload(self, msg, decode=True):
        '''
            Get the payload and decode it if necessary.

            >>> email_message = EmailMessage()
            >>> email_message._get_decoded_payload(None)
        '''
        if msg is None:
            payload = None
        else:
            payload = msg.get_payload(decode=decode)

            if isinstance(payload, bytearray) or isinstance(payload, bytes):
                charset, __ = get_charset(msg, self._last_charset)
                self.log_message('decoding payload with char set: {}'.format(charset))
                try:
                    payload = payload.decode(encoding=charset)
                except:
                    payload = payload.decode(encoding=charset, errors='replace')


        return payload

    def _create_new_header(self, message_string):
        '''
            Create a new header from a corrupted message (internal use only).

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('basic.txt')) as input_file:
            ...    message_string = ''.join(input_file.readlines())
            ...    email_message = EmailMessage()
            ...    body_text_lines = email_message._create_new_header(message_string)
            ...    len(body_text_lines) > 0
            True
        '''

        last_name = None
        body_text_lines = None

        if message_string is None:
            self.log_message('no message string defined to create new header')
        else:
            self.log_message('starting to parse headers')
            lines = message_string.split('\n')
            header_count = 0
            for line in lines:

                if line is None or len(line.strip()) <= 0:
                    self.log_message('finished parsing headers')
                    if header_count + 1 <= len(lines):
                        body_text_lines = lines[header_count + 1:]
                    else:
                        body_text_lines = []
                    break

                else:
                    header_count += 1
                    name, value, last_name = self._parse_header_line(line, last_name)

                    if (name is not None and
                        (name == mime_constants.CONTENT_TYPE_KEYWORD or
                         name == mime_constants.CONTENT_XFER_ENCODING_KEYWORD) ):

                        self._set_content_encoding(name, value)

        return body_text_lines


    def _create_new_body_text(self, body):
        '''
            Create the body text from a corrupted message.

            The pieces of body are concatenated, a notice listing any lines
            collected in self.bad_header_lines is appended, and the result
            is set as the message payload using the charset reported by
            get_charset().

            Args:
                body: an iterable of strings (a plain string also works).

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('basic.txt')) as input_file:
            ...    email_message = EmailMessage(input_file.readlines())
            ...    email_message._create_new_body_text('Test new body text')
        '''

        charset, __ = get_charset(self._message, self._last_charset)
        self.log_message('creating new body text with char set: {}'.format(charset))

        # Encoding is left to set_payload() below. Appending line.encode()
        # (bytes) to a str always raises, which only produced a spurious
        # exception record before this same join was used as the fallback.
        # NOTE(review): the pieces are joined without a separator, so lines
        # that were split on newlines lose their line breaks -- confirm
        # what callers expect before changing this.
        body_text = ''.join(body)

        if self.bad_header_lines:
            notice = ['\n\n{}\n'.format(i18n('Removed bad header lines'))]
            notice.extend('  {}\n'.format(bad_header_line)
                          for bad_header_line in self.bad_header_lines)
            body_text += ''.join(notice)

        self._message.set_payload(body_text, charset=charset)

    def _create_good_message_from_bad(self, source):
        '''
            Create a good message from a source that contains a corrupted message.

            >>> from goodcrypto_tests.mail.message_utils import get_plain_message_name
            >>> with open(get_plain_message_name('bad-basic.txt')) as input_file:
            ...    email_message = EmailMessage()
            ...    email_message._create_good_message_from_bad(input_file)
        '''

        try:
            # start with a fresh message
            self._message = Message()

            if isinstance(source, IOBase):
                source.seek(os.SEEK_SET)
                message_string = source.read()
            else:
                message_string = source

            body_text = self._create_new_header(message_string)
            if body_text:
                self._create_new_body_text(body_text)

        except Exception as message_exception:
            self.log_message(message_exception)
            record_exception()
            raise MessageException(message_exception)

    def init_new_message(self, from_addr, to_addr, subject, text=None):
        ''' Set up a minimal message: From, To and Subject headers plus,
            when text is given and non-empty, the body text.

            Used primarily for testing.

            >>> # In honor of Kirk Wiebe, a whistleblower about Trailblazer, an NSA mass surveillance project.
            >>> from_user = '******'
            >>> to_user = '******'
            >>> email_message = EmailMessage()
            >>> email_message.init_new_message(from_user, to_user, "Test message", 'Test body text')
        '''

        basic_headers = (
            (mime_constants.FROM_KEYWORD, from_addr),
            (mime_constants.TO_KEYWORD, to_addr),
            (mime_constants.SUBJECT_KEYWORD, subject),
        )
        for keyword, header_value in basic_headers:
            self.add_header(keyword, header_value)

        if not text:
            return
        self.set_text(text)


    def log_message_exception(self, exception_error, message, log_msg):
        '''
            Log an exception.

            The exception is logged through log_exception(); when a message
            is given, its text (message.to_string()) is logged as well. A
            failure while logging the message text is itself logged and
            never propagates.

            Args:
                exception_error: passed to log_exception() as message_exception.
                message: an object with a to_string() method, or None.
                log_msg: the text to log with the exception.

            >>> from syr.log import BASE_LOG_DIR
            >>> from syr.user import whoami
            >>> email_message = EmailMessage()
            >>> email_message.log_message_exception(Exception, 'message', 'log message')
            >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'goodcrypto.mail.message.email_message.log'))
            True
            >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'syr.exception.log'))
            True
        '''

        self.log_exception(log_msg, message_exception=exception_error)
        # identity test: a custom __eq__/__ne__ must not decide this
        if message is not None:
            try:
                self.log_message("message:\n" + message.to_string())
            except Exception as exception_error2:
                self.log_message("unable to log message: {}".format(exception_error2))


    def log_exception(self, log_msg, message_exception=None):
        '''
            Log an exception.

            The current exception is recorded, then log_msg is logged and
            recorded. When message_exception is an exception instance or a
            string, its text is logged and recorded as well; any other kind
            of value is ignored.

            Args:
                log_msg: the text to log.
                message_exception: an exception instance, a string, or None.

            >>> from syr.log import BASE_LOG_DIR
            >>> from syr.user import whoami
            >>> email_message = EmailMessage()
            >>> email_message.log_exception('test')
            >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'goodcrypto.mail.message.email_message.log'))
            True
            >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'syr.exception.log'))
            True
            >>> email_message.log_exception('test', message_exception='message exception')
        '''

        record_exception()

        self.log_message(log_msg)
        record_exception(message=log_msg)

        if isinstance(message_exception, Exception):
            # isinstance also matches subclasses, unlike type(x) == Exception.
            # Standard exceptions have no 'value' attribute, so fall back to
            # str(); 'value' is still preferred when an exception provides it.
            detail = getattr(message_exception, 'value', None)
            if detail is None:
                detail = str(message_exception)
        elif isinstance(message_exception, str):
            detail = message_exception
        else:
            detail = None

        if detail is not None:
            self.log_message(detail)
            record_exception(message=detail)

    def log_message(self, message):
        '''
            Write a message to the log, creating the log on first use.

            >>> from syr.log import BASE_LOG_DIR
            >>> from syr.user import whoami
            >>> email_message = EmailMessage()
            >>> email_message.log_message('test')
            >>> os.path.exists(os.path.join(BASE_LOG_DIR, whoami(), 'goodcrypto.mail.message.email_message.log'))
            True
        '''

        log = self._log
        if log is None:
            # the log file is only opened once something is logged
            log = LogFile()
            self._log = log

        log.write_and_flush(message)