Exemplo n.º 1
0
    def extract_headers_metadata(self, headers):
        """Record email headers and derive document metadata from them.

        ``headers`` is an iterable of ``(field, value)`` pairs. The pairs
        are stored on ``self.result.headers`` and then inspected one by
        one (field names compared case-insensitively):

        * ``Subject``      -> ``title``
        * ``Message-ID``   -> ``message_id``
        * ``In-Reply-To``  -> emitted via ``result.emit_in_reply_to``
        * ``References``   -> each whitespace-separated token emitted via
          ``result.emit_in_reply_to``
        * ``Date``         -> ``created_at`` (best effort; failures are
          logged and skipped)
        * ``From``         -> ``author`` for each parsed name
        * ``To``/``CC``/``BCC`` -> handed to ``self.parse_emails``
        """
        # Materialise once: the pairs are consumed twice below, which would
        # silently yield nothing the second time for a one-shot iterator.
        headers = list(headers)
        self.result.headers = safe_dict(dict(headers))
        headers = [(safe_string(k), safe_string(v)) for k, v in headers]
        for field, value in headers:
            # Guard before touching the strings: calling .lower() on a
            # None field would raise AttributeError.
            if field is None or value is None:
                continue
            field = field.lower()

            if field == 'subject':
                self.update('title', value)

            elif field == 'message-id':
                self.update('message_id', value)

            elif field == 'in-reply-to':
                self.result.emit_in_reply_to(value)

            elif field == 'references':
                for message_id in value.split():
                    self.result.emit_in_reply_to(message_id)

            elif field == 'date':
                try:
                    # parsedate() returns None for unparseable input, which
                    # makes mktime() raise; both cases land in the handler.
                    parsed = email.utils.parsedate(value)
                    created_at = datetime.fromtimestamp(mktime(parsed))
                    self.update('created_at', created_at)
                except Exception as ex:
                    # Log the raw header value, not the intermediate result.
                    log.warning("Failed to parse [%s]: %s", value, ex)

            elif field == 'from':
                for (name, _) in self.parse_emails(value):
                    self.update('author', name)

            elif field in ('to', 'cc', 'bcc'):
                self.parse_emails(value)
Exemplo n.º 2
0
    def ingest(self, file_path):
        """Ingest a single-message OPF XML file.

        Parses ``file_path`` with ``self.parse_xml``, builds a header
        mapping from the ``OPFMessageCopy*`` child elements of the single
        ``<email>`` node, stores it on ``self.result.headers``, updates
        title/author/summary/timestamps, and finally extracts either the
        HTML or the plain-text body.

        Raises:
            ProcessingException: if the XML cannot be parsed, or if the
                file does not contain exactly one ``<email>`` element.
        """
        self.result.flag(self.result.FLAG_EMAIL)
        try:
            doc = self.parse_xml(file_path)
        except TypeError:
            raise ProcessingException("Cannot parse OPF XML file.")

        # Search once and reuse the result. The check rejects zero matches
        # as well as several, so the message must not claim "more than one".
        emails = doc.findall('//email')
        if len(emails) != 1:
            raise ProcessingException("Expected exactly one email in file.")

        message = emails[0]
        # Iterate the element directly: Element.getchildren() is deprecated
        # in lxml and was removed from xml.etree in Python 3.9.
        props = {c.tag: safe_string(c.text) for c in message if c.text}
        headers = {
            # Subject is only read here; it is popped further down when
            # the title is set.
            'Subject': props.get('OPFMessageCopySubject'),
            'Message-ID': props.pop('OPFMessageCopyMessageID', None),
            'From': self.get_contacts(message, 'OPFMessageCopyFromAddresses'),
            'Sender': self.get_contacts(message,
                                        'OPFMessageCopySenderAddress'),
            'To': self.get_contacts(message, 'OPFMessageCopyToAddresses'),
            'CC': self.get_contacts(message, 'OPFMessageCopyCCAddresses'),
            'BCC': self.get_contacts(message, 'OPFMessageCopyBCCAddresses'),
        }
        sent_time = props.get('OPFMessageCopySentTime')
        if sent_time is not None:
            # NOTE(review): strptime raises ValueError for any value that
            # does not match this exact format (e.g. fractional seconds or
            # a timezone suffix) - confirm the format is guaranteed.
            sent = datetime.strptime(sent_time, '%Y-%m-%dT%H:%M:%S')
            timestamp = time.mktime(sent.timetuple())
            headers['Date'] = utils.formatdate(timestamp)

        self.result.headers = safe_dict(headers)

        self.update('title', props.pop('OPFMessageCopySubject', None))
        self.update('title', props.pop('OPFMessageCopyThreadTopic', None))
        for tag in ('OPFMessageCopyFromAddresses',
                    'OPFMessageCopySenderAddress'):
            self.update('author', self.get_contact_name(message, tag))

        self.update('summary', props.pop('OPFMessageCopyPreview', None))
        self.update('created_at', props.pop('OPFMessageCopySentTime', None))
        self.update('modified_at', props.pop('OPFMessageCopyModDate', None))

        body = props.pop('OPFMessageCopyBody', None)
        html = props.pop('OPFMessageCopyHTMLBody', None)

        # The flag element carries the literal text '1E0' when HTML exists.
        has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)
        if has_html and safe_string(html):
            self.extract_html_content(html)
            self.result.flag(self.result.FLAG_HTML)
        else:
            self.extract_plain_text_content(body)
            self.result.flag(self.result.FLAG_PLAINTEXT)
Exemplo n.º 3
0
    def _parse_headers(self, message):
        """Populate header metadata for ``message``.

        Field ``'007D'`` is read first (presumably the raw header block -
        confirm against the message format). When present it is parsed
        with ``email.parser.Parser`` and passed to
        ``self.extract_headers_metadata``. If that field is missing, or
        parsing/extraction fails, ``self.result.headers`` is instead built
        from the individual message fields.
        """
        headers = message.getField('007D')
        if headers is not None:
            try:
                # Use a separate name: rebinding ``message`` here would make
                # the fallback below call getField() on the parsed
                # email.message.Message if extraction raises.
                parsed = Parser().parsestr(headers, headersonly=True)
                self.extract_headers_metadata(parsed.items())
                return
            except Exception:
                log.warning("Cannot parse headers: %s", headers)

        self.result.headers = safe_dict({
            'Subject': message.getField('0037'),
            'BCC': message.getField('0E02'),
            'CC': message.getField('0E03'),
            'To': message.getField('0E04'),
            'From': message.getField('1046'),
            'Message-ID': message.getField('1035'),
        })