def test_link(self):
    """An anchor tag is expected to become its title followed by the URL in angle brackets."""
    expected = 'Test link title <http://www.test.com>'
    converted = convert_html_to_text(
        '<a href="http://www.test.com">Test link title</a>',
        keep_linebreaks=True,
    )
    self.assertEqual(converted, expected)
# Feeds a complete HTML document (head with <style> and <title>, body with an
# <h3> and a <ul>) to convert_html_to_text(..., keep_linebreaks=True). The
# expected text contains only the <h3> heading and the list item texts.
# NOTE(review): whitespace inside the triple-quoted literals appears collapsed
# in this view; the real line breaks presumably matter for the comparison --
# verify against the file before touching the literals.
# NOTE(review): a second test named test_list_to_newlines is visible in this
# source; if both live in the same class the later definition replaces this
# one and this test never runs -- confirm and rename one of them.
def test_list_to_newlines(self): html = """ <html> <head> <style type="text/css"> body { background-image: url(dolfijn.jpg); background-position: right center; background-repeat: no-repeat; font-family: sans; font-size: 36pt; font-weight: bold; } </style> <title>Dolfijnwoorden</title> </head> <body> <h3>Dolfijnwoorden 26-02-2014</h3> <ul> <li>Wit</li> <li>Badpak</li> <li>Bedisputeren</li> <li>Dolfijnwoord</li> <li>Hogere wiskunde</li> <li>Moeder</li> <li>Pinpointen</li> <li>Redetwisten</li> <li>Schildpadwoord</li> <li>Sukkelseks</li> <li>Vakantie</li> <li>Vingerpistool</li> <li>Voor jouw beeldvorming</li> <li>Never gonna give you up</li> <li>Never gonna let you down</li> </ul> </body> </html>""" result = """ Dolfijnwoorden 26-02-2014 Wit Badpak Bedisputeren Dolfijnwoord Hogere wiskunde Moeder Pinpointen Redetwisten Schildpadwoord Sukkelseks Vakantie Vingerpistool Voor jouw beeldvorming Never gonna give you up Never gonna let you down """ self.assertEqual(convert_html_to_text(html, keep_linebreaks=True), result)
def test_br_to_space(self):
    """With keep_linebreaks=False the <br> tags are expected to come out as a space."""
    converted = convert_html_to_text(
        'Hello VoipGRID,<br><br>This is a test',
        keep_linebreaks=False,
    )
    self.assertEqual(converted, 'Hello VoipGRID, This is a test')
# Converts markup that mixes <br>, <i> and nested <b><i> tags with
# keep_linebreaks=True and expects only the text content to remain.
# NOTE(review): whitespace inside the triple-quoted expected value appears
# collapsed in this view; the real literal presumably contains line breaks --
# verify against the file before editing it.
def test_remove_different_source_of_tags(self): html = 'Hello VoipGRID,<br><br><i>This is a test</i><br><br><b><i>dsfg</i></b><br><b>dsfg</b><br><b>ds</b>' result = """Hello VoipGRID, This is a test dsfg dsfg ds""" self.assertEqual(convert_html_to_text(html, keep_linebreaks=True), result)
# Converts text containing a <script> element and a two-item <ul> with
# keep_linebreaks=True; the expected value holds the surrounding text and the
# list item texts but none of the script content.
# NOTE(review): whitespace inside the triple-quoted literals appears collapsed
# in this view; the real line breaks presumably matter -- verify against the
# file before editing the literals.
# NOTE(review): another test named test_list_to_newlines is visible in this
# source; if both live in the same class this definition replaces the earlier
# one, which then never runs -- confirm and rename one of them.
def test_list_to_newlines(self): html = """Hi there! <script>console.log('hello');</script> <ul><li>1</li> <li>2</li></ul> Bye! """ result = """Hi there! 1 2 Bye! """ self.assertEqual(convert_html_to_text(html, keep_linebreaks=True), result)
def parse_message(message, remove_tags=None):
    """
    Parse an email.message.Message instance.

    Every part yielded by ``message.walk()`` is first offered to
    ``parse_attachment``; parts it reports as attachments are skipped. The
    remaining parts go through ``parse_body`` and their bodies are appended to
    the HTML or plain text result depending on the reported content type.

    Arguments:
        message (instance): message object exposing ``walk()``
        remove_tags (list): forwarded to ``parse_body``; an empty list is used
            when omitted

    Returns:
        text (str): concatenated 'text/plain' bodies, passed through
            ``convert_html_to_text`` when non-empty
        html (str): concatenated 'text/html' bodies
        attachments (list): container handed to ``parse_attachment``
        inline_attachments (dict): container handed to ``parse_attachment``,
            keyed by content id; entries whose id is not referenced by a
            ``cid:`` image in the HTML are removed when there is HTML
    """
    # A fresh list per call instead of a shared mutable default argument.
    if remove_tags is None:
        remove_tags = []

    text = ''
    html = ''
    attachments = []
    inline_attachments = {}

    for message_part in message.walk():
        # parse_attachment receives both containers; presumably it fills them
        # itself and only reports whether the part was an attachment.
        if parse_attachment(message_part, attachments, inline_attachments):
            continue

        if message_part is None:
            continue

        content_type, body = parse_body(message_part, remove_tags=remove_tags)
        if content_type == 'text/html':
            html += body
        elif content_type == 'text/plain':
            text += body
        # The original ended the loop body with three more `continue`
        # branches; being the last statements they had no effect and were
        # dropped.

    if text:
        text = convert_html_to_text(text)

    if html and inline_attachments:
        soup = BeautifulSoup(html)
        inline_images = soup.findAll('img', {'src': lambda src: src and src.startswith('cid:')})
        # Strip the 4-character 'cid:' prefix to get the bare content ids.
        cids_in_body = set(image.get('src')[4:] for image in inline_images)

        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating over it raises RuntimeError on Python 3.
        for cid in list(inline_attachments):
            if cid not in cids_in_body:
                del inline_attachments[cid]

    return text, html, attachments, inline_attachments
def test_convert_nbsp_to_space(self):
    """The converted input is expected to equal the plain 'Hello VoipGRID' text."""
    # NOTE(review): the input and expected literals look identical in this
    # view; going by the test name the input presumably holds a non-breaking
    # space (or an entity for one) -- verify against the real file.
    expected = 'Hello VoipGRID'
    self.assertEqual(convert_html_to_text('Hello VoipGRID'), expected)
def create_message_query_string(message, account_id, folder_name):
    """
    Build the raw SQL that updates the stored copy of a message.

    Two UPDATE statements are generated, one for ``email_emailmessage`` and
    one for ``messaging_message``, together with the parameters for their
    ``%s`` placeholders.

    Arguments:
        message (instance): Message object
        account_id (int): id of the account
        folder_name (string): name of the folder on the server

    Returns:
        total_query_string (str): both statements, each ending in ';\\n'
        param_list (list): placeholder values in statement order
        query_count (int): number of statements in the query string
    """
    # --- statement 1: email_emailmessage ---------------------------------
    email_fragments = ['UPDATE email_emailmessage SET is_deleted = FALSE, ']
    email_params = []

    flags = message.get_flags()
    if flags:
        email_fragments.append('flags = %s, ')
        email_params.append(str(flags))

    html_body = message.get_html_body(remove_tags=settings.BLACKLISTED_EMAIL_TAGS)
    text_body = message.get_text_body()
    # Fall back to a text rendering of the HTML when there is no text body.
    if html_body is not None and not text_body:
        text_body = convert_html_to_text(html_body, keep_linebreaks=True)

    if html_body is not None:
        email_fragments.append('body_html = %s, ')
        email_params.append(replace_anchors_in_html(html_body))

    # The text body is escaped whether it came from the message or from the
    # HTML fallback above.
    if text_body is not None:
        email_fragments.append('body_text = %s, ')
        email_params.append(escape(text_body))

    email_statement = ''.join(email_fragments)
    # Drop the ', ' separator left behind by the last SET fragment.
    if email_statement.endswith(', '):
        email_statement = email_statement.rstrip(', ')
    email_statement += ' WHERE account_id = %s AND uid = %s AND folder_name = %s;\n'
    email_params.extend([account_id, message.uid, folder_name])

    # --- statement 2: messaging_message ----------------------------------
    sent_date = message.get_sent_date()
    message_fragments = ['UPDATE messaging_message SET ']
    message_params = []

    if flags:
        message_fragments.append('is_seen = %s, ')
        message_params.append(SEEN in flags)

    message_fragments.append('sent_date = %s')
    message_params.append(datetime.strftime(sent_date, '%Y-%m-%d %H:%M:%S%z'))

    message_fragments.append(' WHERE historylistitem_ptr_id = (SELECT message_ptr_id FROM email_emailmessage WHERE account_id = %s AND uid = %s AND folder_name = %s);\n')
    message_params.extend([account_id, message.uid, folder_name])

    statements = [email_statement, ''.join(message_fragments)]
    return ''.join(statements), email_params + message_params, len(statements)
def save_email_message(message, account, folder, email_ctype):
    """
    Fetch or create the EmailMessage for a remote message and fill it in.

    Arguments:
        message (instance): Message object
        account (instance): The email account instance the message is linked to
        folder (instance): The remote folder where the message is stored; its
            ``name_on_server`` and ``identifier`` are read
        email_ctype (integer): ctype id of the EmailMessage class

    Returns:
        email_headers: headers from get_headers_and_identifier, or None when
            the message has no headers
        email_address_headers: address headers from the same call, or None
        email_attachments: result of create_email_attachments, or None when
            the message has no attachments
        inline_email_attachments: result of create_email_attachments with
            inline=True, or None when there are no inline attachments
    """
    sent_date = message.get_sent_date()
    email_message, _created = EmailMessage.objects.get_or_create(
        uid=message.uid,
        folder_name=folder.name_on_server,
        account=account,
        sent_date=sent_date,
        tenant=account.tenant,
    )

    flags = message.get_flags()
    if flags:
        email_message.is_seen = SEEN in flags
        email_message.flags = flags

    html_body = message.get_html_body(remove_tags=settings.BLACKLISTED_EMAIL_TAGS)
    text_body = message.get_text_body()
    if html_body is not None and not text_body:
        # No usable text body: derive one from the HTML (left unescaped).
        text_body = convert_html_to_text(html_body, keep_linebreaks=True)
    elif text_body is not None:
        text_body = escape(text_body)

    # Headers are optional; both results stay None without them.
    email_headers = email_address_headers = None
    headers = message.get_headers()
    if headers is not None:
        email_headers, email_address_headers, identifier = get_headers_and_identifier(headers)
        if identifier:
            email_message.message_identifier = identifier

    # Flag messages whose sender address is the account's own address.
    _sender_name, sender_address = message.get_send_from()
    if account.email.email_address == sender_address:
        email_message.sent_from_account = True

    email_message.body_html = replace_anchors_in_html(html_body)
    email_message.body_text = text_body
    email_message.size = message.get_size()
    email_message.folder_identifier = folder.identifier
    email_message.is_private = False
    email_message.tenant = account.tenant
    email_message.polymorphic_ctype = email_ctype
    email_message.save()

    # Regular attachments
    attachments = message.get_attachments()
    email_attachments = (
        create_email_attachments(attachments, account.tenant_id)
        if len(attachments) else None
    )

    # Inline attachments, passed on as (key, value) pairs
    inline_attachments = message.get_inline_attachments().items()
    inline_email_attachments = (
        create_email_attachments(inline_attachments, account.tenant_id, inline=True)
        if len(inline_attachments) else None
    )

    return email_headers, email_address_headers, email_attachments, inline_email_attachments
def test_br_to_newline(self):
    """With keep_linebreaks=True each <br> tag is expected to become a newline."""
    expected = 'Hello VoipGRID,\n\nThis is a test'
    converted = convert_html_to_text(
        'Hello VoipGRID,<br><br>This is a test',
        keep_linebreaks=True,
    )
    self.assertEqual(converted, expected)