def test_link_breaks_quotation_markers_sequence():
    """Only the reply ("Blah") is kept when links appear among quoted lines."""
    # link starts and ends on the same line
    link_on_one_line = """Blah On Thursday, October 25, 2012 at 3:03 PM, life is short. on Bob wrote: > > Post a response by replying to this email > (http://example.com/c/YzOTYzMmE) > > life is short. (http://example.com/c/YzMmE) > """
    reply = quotations.extract_from_plain(link_on_one_line)
    eq_("Blah", reply)

    # link starts after some text on one line and ends on another
    link_across_lines = """Blah On Monday, 24 September, 2012 at 3:46 PM, bob wrote: > [Ticket #50] test from bob > > View ticket (http://example.com/action _nonce=3dd518) > """
    reply = quotations.extract_from_plain(link_across_lines)
    eq_("Blah", reply)
def test_pattern_original_message():
    """Every 'original message' splitter variant leaves just "Test reply"."""
    bodies = (
        """Test reply -----Original Message----- Test""",
        """Test reply -----Original Message----- Test""",
        """Test reply -----Urspr=C3=BCngliche Nachricht----- Test""",
        u"""Test reply -----Ursprüngliche Nachricht----- Test""",
    )
    # Cases are checked in the same order as they are listed above.
    for body in bodies:
        eq_("Test reply", quotations.extract_from_plain(body))
def test_appointment():
    """The reply with an appointment line is kept; the From/Sent block is cut."""
    email_text = """Response 10/19/2017 @ 9:30 am for physical therapy Bla 1517 4th Avenue Ste 300 London CA 19129, 555-421-6780 John Doe, FCLS Mailgun Inc 555-941-0697 From: [email protected] [mailto:[email protected]] Sent: Wednesday, October 18, 2017 2:05 PM To: John Doer - SIU <*****@*****.**> Subject: RE: Claim # 5551188-1 Text"""
    wanted = """Response 10/19/2017 @ 9:30 am for physical therapy Bla 1517 4th Avenue Ste 300 London CA 19129, 555-421-6780 John Doe, FCLS Mailgun Inc 555-941-0697"""
    actual = quotations.extract_from_plain(email_text)
    eq_(wanted, actual)
def _check_pattern_original_message(original_message_indicator):
    """Assert that a splitter built from the given indicator is recognised.

    The indicator is converted to text, placed between dashes after
    "Test reply", and the extracted reply must equal 'Test reply'.
    """
    template = u"""Test reply -----{}----- Test"""
    indicator_text = six.text_type(original_message_indicator)
    full_message = template.format(indicator_text)
    eq_('Test reply', quotations.extract_from_plain(full_message))
def test_norwegian_from_line():
    """A 'På ... skrev:' line starts the quotation; 'Lorem' is the reply."""
    email_text = u"""Lorem På 14 september 2015 på 02:23:18, Valentino Rudy ([email protected]) skrev: Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. """
    reply = quotations.extract_from_plain(email_text)
    eq_('Lorem', reply)
def test_pattern_original_message():
    """Both '-----Original Message-----' samples reduce to "Test reply"."""
    bodies = (
        """Test reply -----Original Message----- Test""",
        """Test reply -----Original Message----- Test""",
    )
    for body in bodies:
        eq_("Test reply", quotations.extract_from_plain(body))
def test_reply_after_quotations():
    """A reply written after the quoted block is still returned."""
    email_text = """On 04/19/2011 07:10 AM, Roman Tkachenko wrote: > > Test Test reply"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_too_many_lines():
    """The expected reply for this sample is "Test reply"."""
    email_text = """Test reply Hi -----Original Message----- Test"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_short_quotation():
    """A one-line quotation after an 'On ... wrote:' line is removed."""
    email_text = """Hi On 04/19/2011 07:10 AM, Roman Tkachenko wrote: > Hello"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Hi", reply)
def test_french_from_block():
    """A 'Le ... a écrit:' line starts the quotation; 'Lorem ipsum' remains."""
    email_text = u"""Lorem ipsum Le 23 janv. 2015 à 22:03, Brendan xxx <[email protected]<mailto:[email protected]>> a écrit: Bonjour!"""
    reply = quotations.extract_from_plain(email_text)
    eq_('Lorem ipsum', reply)
def test_vietnamese_from_block():
    """A 'Vào ... đã viết:' line starts the quotation; 'Hello' remains."""
    email_text = u"""Hello Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <*****@*****.**> đã viết: > Xin chào """
    reply = quotations.extract_from_plain(email_text)
    eq_('Hello', reply)
def test_pattern_on_date_wrote_somebody():
    """An 'Op <date> schreef <person>:' line starts the quotation."""
    email_text = """Lorem Op 13-02-2014 3:18 schreef Julius Caesar <*****@*****.**>: Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. """
    reply = quotations.extract_from_plain(email_text)
    eq_('Lorem', reply)
def test_dutch_from_block():
    """An 'Op ... heeft ... het volgende geschreven:' line starts the quotation."""
    wanted = 'Gluten-free culpa lo-fi et nesciunt nostrud.'
    email_text = """Gluten-free culpa lo-fi et nesciunt nostrud. Op 17-feb.-2015, om 13:18 heeft Julius Caesar <*****@*****.**> het volgende geschreven: Small batch beard laboris tempor, non listicle hella Tumblr heirloom. """
    eq_(wanted, quotations.extract_from_plain(email_text))
def test_with_indent():
    """A dash-wrapped 'On ... wrote' splitter is recognised in this sample."""
    email_text = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin. ------On 12/29/1987 17:32 PM, Julius Caesar wrote----- Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. """
    wanted = "YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin."
    actual = quotations.extract_from_plain(email_text)
    eq_(wanted, actual)
def test_from_block_starts_with_date():
    """A header block that begins with 'Date:' is treated as a quotation."""
    email_text = """Blah Date: Wed, 16 May 2012 00:15:02 -0600 To: [email protected] """
    reply = quotations.extract_from_plain(email_text)
    eq_('Blah', reply)
def test_pattern_on_date_somebody_wrote_allows_space_in_front():
    """The 'On ... wrote:' splitter is found in this sample; reply is kept."""
    email_text = """Thanks Thanmai On Mar 8, 2012 9:59 AM, "Example.com" < *****@*****.**> wrote: >** > Blah-blah-blah"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Thanks Thanmai", reply)
def test_android_wrote():
    """A '---- <person> wrote ----' line starts the quotation."""
    email_text = """Test reply ---- John Smith wrote ---- > quoted > text """
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_weird_date_format_in_date_block():
    """A 'Date:' header containing 'Fri=2C' still starts the quoted block."""
    email_text = """Blah Date: Fri=2C 28 Sep 2012 10:55:48 +0000 From: [email protected] To: [email protected] Subject: [Ticket #8] Test """
    reply = quotations.extract_from_plain(email_text)
    eq_('Blah', reply)
def test_polish_from_block():
    """A 'W dniu ... napisał:' line starts the quotation."""
    email_text = u"""Lorem ipsum W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <*****@*****.**> napisał: Blah! """
    reply = quotations.extract_from_plain(email_text)
    eq_('Lorem ipsum', reply)
def test_feedback_below_left_unparsed():
    """A message with dashed 'Enter Feedback Below' rules is returned unchanged."""
    msg_body = """Please enter your feedback below. Thank you. ------------------------------------- Enter Feedback Below ------------------------------------- The user experience was unparallelled. Please continue production. I'm sending payment to ensure that this line is intact."""
    parsed = quotations.extract_from_plain(msg_body)
    # Python 3 `str` has no `.decode`; only decode when bytes came back so the
    # comparison works whichever type the extractor returns.
    if isinstance(parsed, bytes):
        parsed = parsed.decode('utf8')
    eq_(msg_body, parsed)
def test_appointment():
    """An invitation containing a 'Date:' line is returned unchanged."""
    msg_body = """Invitation for an interview: Date: Wednesday 3, October 2011 Time: 7 : 00am Address: 130 Fox St Please bring in your ID."""
    parsed = quotations.extract_from_plain(msg_body)
    # Python 3 `str` has no `.decode`; only decode when bytes came back so the
    # comparison works whichever type the extractor returns.
    if isinstance(parsed, bytes):
        parsed = parsed.decode('utf8')
    eq_(msg_body, parsed)
def test_date_time_email_splitter():
    """A '<date> <time> <name> <email>:' line starts the quotation."""
    email_text = """Test reply 2014-10-17 11:28 GMT+03:00 Postmaster < *****@*****.**>: > First from site > """
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_link_closed_with_quotation_marker_on_new_line():
    """The From block is cut even though '>' characters close links in it."""
    email_text = '''8.45am-1pm From: [email protected] <http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT > <[email protected] <mailto:[email protected]> > Requester: '''
    reply = quotations.extract_from_plain(email_text)
    eq_('8.45am-1pm', reply)
def test_pattern_on_date_somebody_wrote_date_with_dots_german():
    """An 'Am <dd.mm.yyyy> ... schrieb <person>:' line starts the quotation."""
    email_text = """Test reply Am 25.11.2014 14:59 schrieb Roman Tkachenko: > > Test. > > Roman"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_pattern_on_date_somebody_wrote_date_with_slashes():
    """An 'On <mm/dd/yyyy> ... wrote:' line starts the quotation."""
    email_text = """Test reply On 04/19/2011 07:10 AM, Roman Tkachenko wrote: > > Test. > > Roman"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_pattern_on_date_somebody_sent():
    """An 'On <date>, at <time>, <person> sent:' line starts the quotation."""
    email_text = """Test reply On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <*****@*****.**> sent: > > Test > > Roman"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_english_from_block():
    """A From/Sent/To/Subject header block is cut from the reply."""
    email_text = """Allo! Follow up MIME! From: [email protected] Sent: March-19-11 5:42 PM To: Somebody Subject: The manager has commented on your Loop Blah-blah-blah """
    reply = quotations.extract_from_plain(email_text)
    eq_('Allo! Follow up MIME!', reply)
def test_quotation_separator_takes_3_lines():
    """The 'On ... wrote:' separator of this sample is recognised."""
    email_text = """Test reply On Nov 30, 2011, at 12:47 PM, Somebody < *****@*****.**> wrote: Test message """
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_swedish_from_block():
    """A Från/Skickat/Till/Ämne header block is cut from the reply."""
    email_text = u"""Allo! Follow up MIME! Från: Anno Sportel [mailto:[email protected]] Skickat: den 26 augusti 2015 14:45 Till: Isacson Leiff Ämne: RE: Week 36 Blah-blah-blah """
    reply = quotations.extract_from_plain(email_text)
    eq_('Allo! Follow up MIME!', reply)
def test_bold_from_block():
    """Header labels wrapped in asterisks ('*From:*') still start the quotation."""
    email_text = """Hi *From:* [email protected] [mailto: [email protected]] *Sent:* Wednesday, June 27, 2012 3:05 PM *To:* [email protected] *Subject:* Hello """
    reply = quotations.extract_from_plain(email_text)
    eq_("Hi", reply)
def test_french_multiline_from_block():
    """A 'De :/Envoyé :/À :/Objet :' header block is cut from the reply."""
    email_text = u"""Lorem ipsum De : Brendan xxx [mailto:[email protected]] Envoyé : vendredi 23 janvier 2015 16:39 À : Camille XXX Objet : Follow Up Blah-blah-blah """
    reply = quotations.extract_from_plain(email_text)
    eq_('Lorem ipsum', reply)
def test_swedish_from_block():
    """A Från/Skickat/Till/Ämne header block is cut from the reply."""
    wanted = "Allo! Follow up MIME!"
    email_text = u"""Allo! Follow up MIME! Från: Anno Sportel [mailto:[email protected]] Skickat: den 26 augusti 2015 14:45 Till: Isacson Leiff Ämne: RE: Week 36 Blah-blah-blah """
    eq_(wanted, quotations.extract_from_plain(email_text))
def test_pattern_on_date_polymail():
    """An 'On ... < mailto:... > wrote:' splitter is recognised."""
    email_text = """Test reply On Tue, Apr 11, 2017 at 10:07 PM John Smith < mailto:John Smith <*****@*****.**> > wrote: Test quoted data """
    reply = quotations.extract_from_plain(email_text)
    eq_("Test reply", reply)
def test_german_from_block():
    """A Von/Gesendet/An/Betreff header block is cut from the reply."""
    email_text = """Allo! Follow up MIME! Von: [email protected] Gesendet: Dienstag, 25. November 2014 14:59 An: Somebody Betreff: The manager has commented on your Loop Blah-blah-blah """
    reply = quotations.extract_from_plain(email_text)
    eq_('Allo! Follow up MIME!', reply)
def test_english_from_block():
    """A From/Sent/To/Subject header block is cut from the reply."""
    wanted = 'Allo! Follow up MIME!'
    email_text = """Allo! Follow up MIME! From: [email protected] Sent: March-19-11 5:42 PM To: Somebody Subject: The manager has commented on your Loop Blah-blah-blah """
    eq_(wanted, quotations.extract_from_plain(email_text))
def test_danish_from_block():
    """A Fra/Sendt/Til/Emne header block is cut from the reply."""
    email_text = """Allo! Follow up MIME! Fra: [email protected] Sendt: 19. march 2011 12:10 Til: Somebody Emne: The manager has commented on your Loop Blah-blah-blah """
    reply = quotations.extract_from_plain(email_text)
    eq_('Allo! Follow up MIME!', reply)
def test_dont_parse_quotations_for_forwarded_messages():
    """A forwarded message is returned exactly as it was passed in."""
    forwarded = """FYI ---------- Forwarded message ---------- From: [email protected] Date: Tue, Sep 4, 2012 at 1:35 PM Subject: Two line subject To: [email protected] Text"""
    result = quotations.extract_from_plain(forwarded)
    eq_(forwarded, result)
def push_to_api(message):
    """Extract the reply part of an email message and POST it to the API.

    Args:
        message: dict with at least ``id`` and ``payload`` keys (and
            ``snippet`` for single-part HTML mails). Presumably a Gmail API
            message resource -- TODO confirm against the caller.

    Raises:
        ValueError: if the payload is neither multipart nor a single
            ``text/html`` part with a body and a snippet.
        requests.HTTPError: if the API answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    email_id = message['id']

    from_header = get_header(message, 'From')
    if '<' in from_header:
        # Split on the first '<' instead of ' <' so headers such as
        # '<user@example.com>' or 'Name<user@example.com>' do not raise
        # IndexError.
        name_part, _, address_part = from_header.partition('<')
        from_email = address_part.split('>')[0]
        from_name = name_part.strip().strip('"')
        # A missing display name, or one that just repeats the address,
        # carries no information.
        if not from_name or from_name == from_email:
            from_name = None
    else:
        from_email = from_header
        from_name = None

    subject = get_header(message, 'Subject')

    body = message['payload']
    if 'parts' in body:
        full_html = get_body_by_mime_type(message, 'text/html')
        html_reply = quotations.extract_from_html(full_html)
        full_text = get_body_by_mime_type(message, 'text/plain')
        text_reply = quotations.extract_from_plain(full_text)
    elif (body.get('mimeType') == 'text/html'
          and 'body' in body
          and 'snippet' in message):
        full_html = decode_base_64_data(body['body']['data'])
        html_reply = quotations.extract_from_html(full_html)
        # No plain-text part is available; the snippet stands in for both
        # the full text and the reply text.
        full_text = message['snippet']
        text_reply = message['snippet']
    else:
        # Raising a bare string is itself a TypeError; raise a real exception
        # carrying the intended message instead.
        raise ValueError('Unsupported email format')

    payload = {
        'emailId': email_id,
        'fromEmail': from_email,
        'fromName': from_name,
        'subject': subject,
        'fullHtml': full_html,
        'htmlReply': html_reply,
        'fullText': full_text,
        'textReply': text_reply,
    }
    headers = {
        'authorization': os.getenv('API_AUTHORIZATION_HEADER'),
        'accept': 'application/vnd.faultfixers.v14+json',
        'content-type': 'application/json',
    }

    response = requests.post(os.getenv('API_ENDPOINT'), headers=headers, json=payload)
    response.raise_for_status()
def test_feedback_below_left_unparsed():
    """A message with dashed 'Enter Feedback Below' rules is returned unchanged."""
    original = """Please enter your feedback below. Thank you. ------------------------------------- Enter Feedback Below ------------------------------------- The user experience was unparallelled. Please continue production. I'm sending payment to ensure that this line is intact."""
    result = quotations.extract_from_plain(original)
    # Normalise to text when bytes are returned.
    result = result.decode('utf8') if isinstance(result, bytes) else result
    eq_(original, result)
def test_appointment_2():
    """An invitation containing a 'Date:' line is returned unchanged."""
    original = """Invitation for an interview: Date: Wednesday 3, October 2011 Time: 7 : 00am Address: 130 Fox St Please bring in your ID."""
    result = quotations.extract_from_plain(original)
    # Normalise to text when bytes are returned.
    result = result.decode('utf8') if isinstance(result, bytes) else result
    eq_(original, result)
def test_reply_and_quotation_splitter_share_line():
    """Reply text sharing a line with the splitter is still separated from it."""
    # reply lines and 'On <date> <person> wrote:' splitter pattern
    # are on the same line
    plain_case = """reply On Wed, Apr 4, 2012 at 3:59 PM, [email protected] wrote: > Hi"""
    eq_('reply', quotations.extract_from_plain(plain_case))

    # test pattern '--- On <date> <person> wrote:' with reply text on
    # the same line
    dashed_case = """reply--- On Wed, Apr 4, 2012 at 3:59 PM, [email protected] wrote: > Hi"""
    eq_('reply', quotations.extract_from_plain(dashed_case))

    # test pattern '--- On <date> <person> wrote:' with reply text containing
    # '-' symbol
    wanted = """reply bla-bla - bla"""
    dash_in_reply_case = """reply bla-bla - bla--- On Wed, Apr 4, 2012 at 3:59 PM, [email protected] wrote: > Hi"""
    eq_(wanted, quotations.extract_from_plain(dash_in_reply_case))
def extract_body(message: message.Message) -> str:
    """Return the reply text of ``message`` with quoted content removed.

    The text/plain part is preferred. When only text/html is present, the
    quotation is stripped from the HTML and the result is converted to
    Markdown. Raises ZulipEmailForwardError when neither part is found.
    """
    plain_part = get_message_part_by_type(message, "text/plain")
    if not plain_part:
        # No plaintext version: fall back to HTML and try to make it look nice.
        html_part = get_message_part_by_type(message, "text/html")
        if not html_part:
            raise ZulipEmailForwardError("Unable to find plaintext or HTML message body")
        stripped_html = quotations.extract_from_html(html_part)
        return convert_html_to_markdown(stripped_html)

    return quotations.extract_from_plain(plain_part)
def get_message_body(message: EmailMessage) -> Optional[str]:
    """
    Return the cleaned core body of ``message``, or ``None`` if unusable.

    For the goal of the project we want unique, unstructured text of a
    reasonable length, so forwarded/replied content is discarded, as are
    bodies that are too short or too long.

    NOTE: As part of identifying actionable emails the above may change,
    but for now this is the working assumption.

    The Enron dataset used for testing contains quite a number of poorly
    parsed email bodies. Problems such as URLs split across lines cause
    havoc unless the data is cleaned, so some of the 'strange' parsing seen
    here is a direct result of the data used. We have to make do for now.

    READING:
        * General discussion on the topic.
          https://en.wikipedia.org/wiki/Posting_style
        * MailGun cited these papers for their 'Talon' project.
          http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
          http://www.cs.cornell.edu/people/tj/publications/joachims_01a.pdf

    :param message: a parsed EmailMessage
    :return: cleaned message body as a string
    """
    body_part: Optional[EmailMessage] = message.get_body()  # type: ignore
    if not isinstance(body_part, EmailMessage):
        return None

    raw_body: Optional[str] = extract_core_message_body(body_part)
    if not (raw_body and isinstance(raw_body, str)):
        return None

    # Drop inline mails from the raw body.
    cleaned: str = remove_inline_message(raw_body)

    # Length is checked here so the steps below are skipped for rejects.
    if not is_valid_length(text=cleaned, minimum=250, maximum=5_000):
        return None

    # HTML bodies are reduced to their text contents.
    if "html" in body_part.get_content_subtype():
        cleaned = strip_html_contents(text=cleaned)

    # Let Talon attempt to remove message quotations.
    return str(quotations.extract_from_plain(cleaned))
def post(self, request):
    """
    Receive conversation replies via e-mail
    """
    token = request.META.get('HTTP_X_MESSAGESYSTEMS_WEBHOOK_TOKEN')
    if token is None or token != settings.SPARKPOST_RELAY_SECRET:
        error_body = {
            'message': 'Invalid HTTP_X_MESSAGESYSTEMS_WEBHOOK_TOKEN header'
        }
        return Response(status=status.HTTP_403_FORBIDDEN, data=error_body)

    # Collect every event's messages up front, then handle them one by one.
    batches = [event['msys'].values() for event in request.data]
    for batch in batches:
        for message in batch:
            # 1. get email content and reply-to
            _, reply_to = parseaddr(message['rcpt_to'])
            content = message['content']

            # 2. check local part of reply-to and extract conversation and
            #    user (fail if they don't exist)
            local_part = reply_to.split('@')[0]
            conversation_id, user_id, thread_id = parse_local_part(local_part)
            user = get_user_model().objects.get(id=user_id)

            if thread_id is None:
                thread = None
                conversation = Conversation.objects.get(id=conversation_id)
            else:
                thread = ConversationMessage.objects.get(id=thread_id)
                conversation = thread.conversation

            is_participant = conversation.participants.filter(id=user.id).exists()
            if not is_participant:
                raise Exception('User not in conversation')

            # 3. extract the email reply text and add it to the conversation
            reply_plain = quotations.extract_from_plain(content['text'])
            ConversationMessage.objects.create(
                author=user,
                conversation=conversation,
                thread=thread,
                content=reply_plain,
                received_via='email',
            )

    return Response(status=status.HTTP_200_OK, data={})
def test_reply_wraps_quotations():
    """Reply text before and after the quoted block is kept."""
    email_text = """Test reply On 04/19/2011 07:10 AM, Roman Tkachenko wrote: > > Test Regards, Roman"""
    wanted = """Test reply Regards, Roman"""
    actual = quotations.extract_from_plain(email_text)
    eq_(wanted, actual)
def test_forwarded_message_in_quotations():
    """A forwarded block after an 'Original Message' splitter is still cut."""
    email_text = """Blah -----Original Message----- FYI ---------- Forwarded message ---------- From: [email protected] Date: Tue, Sep 4, 2012 at 1:35 PM Subject: Two line subject To: [email protected] """
    reply = quotations.extract_from_plain(email_text)
    eq_("Blah", reply)
def test_short_quotation_with_newline():
    """Unmarked quoted text after an 'On ... wrote:' line is removed."""
    email_text = """Btw blah blah... On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <*****@*****.**> wrote: Hi Mark, Blah blah? Thanks,Christine On Jan 27, 2015, at 11:55 AM, Mark XXX <*****@*****.**> wrote: Lorem ipsum? Mark Sent from Acompli"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Btw blah blah...", reply)
def test_quotation_separator_takes_2_lines():
    """The separator and quoted lines are cut; text on both sides is kept."""
    email_text = """Test reply On Fri, May 6, 2011 at 6:03 PM, Roman Tkachenko from Hacker News <*****@*****.**> wrote: > Test. > > Roman Regards, Roman"""
    wanted = """Test reply Regards, Roman"""
    actual = quotations.extract_from_plain(email_text)
    eq_(wanted, actual)
def test_standard_replies():
    """Yield one ``eq_`` check per ``.eml`` fixture in STANDARD_REPLIES.

    The expected reply is read from the sibling ``<name>_reply_text`` file
    when it exists, otherwise it is 'Hello'.
    """
    for entry in os.listdir(STANDARD_REPLIES):
        path = os.path.join(STANDARD_REPLIES, entry)
        is_eml_file = path.endswith('.eml') and not os.path.isdir(path)
        if not is_eml_file:
            continue

        with open(path) as eml:
            message = email.message_from_file(eml)
            plain_parts = email.iterators.typed_subpart_iterator(
                message, subtype='plain')
            body = next(plain_parts)
            text = ''.join(body_iterator(body, True))

            stripped_text = quotations.extract_from_plain(text)

            # Drop the '.eml' suffix to locate the expected-reply file.
            expected_path = path[:-4] + '_reply_text'
            reply_text = 'Hello'
            if os.path.isfile(expected_path):
                with open(expected_path) as expected_file:
                    reply_text = expected_file.read().strip()

            details = {'reply': reply_text, 'stripped': stripped_text, 'fn': path}
            failure_message = "'%(reply)s' != %(stripped)s for %(fn)s" % details
            yield eq_, reply_text, stripped_text, failure_message
def test_standard_replies():
    """Generate an ``eq_`` check for every ``.eml`` file in STANDARD_REPLIES.

    Each fixture's plain-text part is run through the quotation extractor
    and compared with ``<name>_reply_text`` if present, else with "Hello".
    """
    for name in os.listdir(STANDARD_REPLIES):
        eml_path = os.path.join(STANDARD_REPLIES, name)
        if os.path.isdir(eml_path) or not eml_path.endswith(".eml"):
            continue

        with open(eml_path) as handle:
            message = email.message_from_file(handle)
            first_plain = next(
                email.iterators.typed_subpart_iterator(message, subtype="plain"))
            text = "".join(body_iterator(first_plain, True))

            stripped_text = quotations.extract_from_plain(text)

            # Same base name as the fixture, with '.eml' removed.
            reply_text_path = eml_path[:-4] + "_reply_text"
            if not os.path.isfile(reply_text_path):
                reply_text = "Hello"
            else:
                with open(reply_text_path) as reply_file:
                    reply_text = reply_file.read().strip()

            context = {
                "reply": reply_text,
                "stripped": stripped_text,
                "fn": eml_path,
            }
            yield (
                eq_,
                reply_text,
                stripped_text,
                "'%(reply)s' != %(stripped)s for %(fn)s" % context,
            )
def test_empty_body():
    """An empty message yields an empty reply."""
    empty = ''
    reply = quotations.extract_from_plain(empty)
    eq_('', reply)
def extractBody(self, s):
    """Return the body of email ``s`` without quotations and signature.

    The body is obtained via ``self.extractBodyFromEmail``, quoted content
    is removed, and the text half of ``extract_signature``'s result is
    returned; the signature half is discarded.
    """
    raw_body = self.extractBodyFromEmail(s)
    without_quotes = quotations.extract_from_plain(raw_body)
    text, _signature = extract_signature(without_quotes)
    return text
__author__ = 'a_medelyan'

import talon
from talon import quotations
from talon.signature.bruteforce import extract_signature

talon.init()

# Sample message: a short reply with a signature block, followed by a
# '>'-quoted original message.
text = "The price is still 91.87.\n\nKeoni Almeida\nCalifornia Independent System Operator\nphone: 916/608-7053\npager: 916/814-7352\nalpha page: [email protected]\ne-mail: <mailto:[email protected]>\n\n\n\n> -----Original Message-----\n> From:\tCRCommunications\n> Sent:\tFriday, June 22, 2001 11:34 AM\n> To:\tISO Market Participants\n> Subject:\tCAISO Notice: Update to June 20 Market Notice\n>\n> <<MARKET NOTICE 010622_.doc>>\n>\n> Market Participants:\n> Please read the attached explanation of Footnote 14 in the California ISO\n> June 20, 2001, Market Notice.\n>\n> CR Communications\n> Client Relations Communications\n\n - MARKET NOTICE 010622_.doc"

reply = quotations.extract_from_plain(text)
# extract_signature returns a pair; index 1 is taken as the signature here.
signature = extract_signature(text)[1]

# The Python 2 print statement is a SyntaxError on Python 3. A single
# pre-formatted argument prints identically on both; the two spaces after
# the colon reproduce what `print "Reply: ", reply` emitted.
print("Reply:  %s" % (reply,))
print("Signature:  %s" % (signature,))
# print message_id # print csv_signature # print csv_authored_content # print "-------" text = dataset[message_id] # find talon signatures results = extract_signature(text) if results[1]: talon_signature = results[1].split('\n') else: talon_signature = [] #find talon authored content talon_authored_content = quotations.extract_from_plain(text).split('\n') # do a comparative scoring of results found if len(talon_signature) > 0 or len(csv_signature) > 0: required = set(csv_signature) signature_lines_total += len(csv_signature) for line in talon_signature: if len(line) > 0: signature_lines_talon += 1.0 if line in required: signature_lines_correct += 1.0 if len(talon_authored_content) > 0 or len(csv_authored_content) > 0: required = set(csv_authored_content) ac_lines_total += len(csv_authored_content) for line in talon_authored_content:
def test_line_starts_with_on():
    """Text containing 'On' that is not a splitter is returned unchanged."""
    original = """Blah-blah-blah On blah-blah-blah"""
    result = quotations.extract_from_plain(original)
    eq_(original, result)
def test_pattern_date_email_with_unicode():
    """A '<date> <name> <email>' splitter with non-ASCII name bytes is found."""
    email_text = """Replying ok 2011/4/7 Nathan \xd0\xb8ova <*****@*****.**> > Cool beans, scro"""
    reply = quotations.extract_from_plain(email_text)
    eq_("Replying ok", reply)
def test_quotation_marker_false_positive():
    """'>>>' used as decoration is not treated as a quotation marker."""
    original = """Visit us now for assistance... >>> >>> http://www.domain.com <<< Visit our site by clicking the link above"""
    result = quotations.extract_from_plain(original)
    eq_(original, result)
def test_from_block_starts_with_date():
    """A header block that begins with 'Date:' is treated as a quotation."""
    email_text = """Blah Date: Wed, 16 May 2012 00:15:02 -0600 To: [email protected]"""
    reply = quotations.extract_from_plain(email_text)
    eq_('Blah', reply)
def test_reply_quotations_share_block():
    """The extracted reply is non-empty and contains no 'From'."""
    reply = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
    ok_(reply)
    from_is_absent = 'From' not in reply
    ok_(from_is_absent)
def test_preprocess_postprocess_2_links():
    """Two bracketed links on one line come back unchanged."""
    original = "<http://link1> <http://link2>"
    result = quotations.extract_from_plain(original)
    eq_(original, result)