def _process_single_node(self, node):
    """
    Extract the contents of a single XML dump node.

    When ``use_full_parser`` accepts the node's text, the text is repaired
    with ``fix_broken_hotmail_headers``, parsed with ``Parser`` and converted
    by ``get_nested_payload``. Otherwise an ``EmailMessage`` is assembled by
    hand from the node's ``subject``, ``from``, ``to`` and ``receivedat``
    child elements, using the raw text as the body.

    :param node: The XML node corresponding to a message
    :return: An EmailMessage instance containing the message contents
    """
    text = unicode(node.find('text').text)
    # Remove the leading '>' characters that got into the text somehow.
    text = text.lstrip(u'>')

    if use_full_parser(text):
        text = fix_broken_hotmail_headers(text)
        mime_message = Parser().parse(StringIO(text))
        return_message = get_nested_payload(mime_message)
    else:
        return_message = EmailMessage()
        subject_node = node.find('subject')
        from_node = node.find('from')
        to_node = node.find('to')
        date_node = node.find('receivedat')

        # Element text can be a byte string, an already-decoded unicode
        # string, or None for an empty element. Only byte strings may be
        # decoded; decoding the other two raises TypeError.
        subject = subject_node.text if subject_node is not None else None
        if subject is None:
            subject = ''
        elif isinstance(subject, bytes):
            subject = subject.decode('utf-8')

        # Unicode templates: formatting a non-ASCII unicode name into a byte
        # string template triggers an implicit ASCII encode and fails.
        sender = clean_sender(u'{} <{}>'.format(
            from_node.find('name').text, from_node.find('email').text))
        recipient = clean_recipient(u'{} <{}>'.format(
            to_node.find('name').text, to_node.find('email').text))
        date_string = '{} {}'.format(
            date_node.find('date').text, date_node.find('time').text)

        return_message.append_body(text)
        return_message.subject = subject
        return_message.sender = sender
        return_message.recipient = recipient
        return_message.date = parse(date_string)

    # NOTE(review): assumes the UTC normalisation and source tagging apply to
    # messages from both branches -- confirm against the intended nesting.
    return_message.date = normalize_to_utc(return_message.date, self._timezone)
    return_message.source = "XML File {} node {}".format(
        self._process_path, node.attrib)
    return return_message
def _process_multipart_eml(file_path):
    """
    Given an EML file, clean it up, parse it, and extract the contents we want to keep.

    The file is decoded as windows-1252. When ``use_full_parser`` accepts the
    text, it is first repaired with ``fix_broken_yahoo_headers``. The text is
    then parsed with ``Parser`` and converted by ``get_nested_payload``.

    :param file_path: The path to the EML file to process
    :return: A structured EmailMessage instance
    """
    with codecs.open(file_path, 'rb', 'windows-1252') as text_file:
        # The codecs reader already yields decoded text, so one read() gives
        # the same string as joining readlines() and re-wrapping in unicode().
        text = text_file.read()

    # Only the header repair is conditional; parsing always runs so that
    # return_message is bound on every path.
    # NOTE(review): confirm that parsing was not meant to be skipped when
    # use_full_parser() rejects the text.
    if use_full_parser(text):
        text = fix_broken_yahoo_headers(text)

    mime_message = Parser().parse(StringIO(text))
    return_message = get_nested_payload(mime_message)
    return_message.source = "EML File {}".format(file_path)
    return return_message
def _process_single_node(self, node):
    """
    Extract the contents of a single XML dump node.

    When ``use_full_parser`` accepts the node's text, the text is repaired
    with ``fix_broken_hotmail_headers``, parsed with ``Parser`` and converted
    by ``get_nested_payload``. Otherwise an ``EmailMessage`` is assembled by
    hand from the node's ``subject``, ``from``, ``to`` and ``receivedat``
    child elements, using the raw text as the body.

    :param node: The XML node corresponding to a message
    :return: An EmailMessage instance containing the message contents
    """
    text = unicode(node.find('text').text)
    # Remove the leading '>' characters that got into the text somehow.
    text = text.lstrip(u'>')

    if use_full_parser(text):
        text = fix_broken_hotmail_headers(text)
        mime_message = Parser().parse(StringIO(text))
        return_message = get_nested_payload(mime_message)
    else:
        return_message = EmailMessage()
        subject_node = node.find('subject')
        from_node = node.find('from')
        to_node = node.find('to')
        date_node = node.find('receivedat')

        # Element text can be a byte string, an already-decoded unicode
        # string, or None for an empty element. Only byte strings may be
        # decoded; decoding the other two raises TypeError.
        subject = subject_node.text if subject_node is not None else None
        if subject is None:
            subject = ''
        elif isinstance(subject, bytes):
            subject = subject.decode('utf-8')

        # Unicode templates: formatting a non-ASCII unicode name into a byte
        # string template triggers an implicit ASCII encode and fails.
        sender = clean_sender(u'{} <{}>'.format(
            from_node.find('name').text, from_node.find('email').text))
        recipient = clean_recipient(u'{} <{}>'.format(
            to_node.find('name').text, to_node.find('email').text))
        date_string = '{} {}'.format(
            date_node.find('date').text, date_node.find('time').text)

        return_message.append_body(text)
        return_message.subject = subject
        return_message.sender = sender
        return_message.recipient = recipient
        return_message.date = parse(date_string)

    # NOTE(review): assumes the UTC normalisation and source tagging apply to
    # messages from both branches -- confirm against the intended nesting.
    return_message.date = normalize_to_utc(return_message.date, self._timezone)
    return_message.source = "XML File {} node {}".format(
        self._process_path, node.attrib)
    return return_message