# Exercises bruteforce.extract_signature on messages whose signature opens
# with a closing phrase ("Thanks!", "Best regards,", "Regards,", "Sincerely,",
# "Take care,"), with and without leading "--" delimiter lines, and compares
# the returned (body, signature) tuple with eq_.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals show spaces where the expected tuples
# contain "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_signature_words(): msg_body = '''Hey! Thanks! Roman''' eq_(('Hey!', 'Thanks!\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! -- Best regards, Roman''' eq_(('Hey!', '--\nBest regards,\n\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! -- -- Regards, Roman''' eq_(('Hey!', '--\n--\nRegards,\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! Sincerely, Roman''' eq_(('Hey!', 'Sincerely,\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! Take care, Roman''' eq_(('Hey!', 'Take care,\nRoman'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on BlackBerry device footers: an
# English + French "Sent wirelessly ..." footer (expected signature is
# everything after the first line, msg_body[len('Heeyyoooo.\n'):]) and a
# Spanish "Enviado desde mi oficina móvil BlackBerry" footer.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals contain spaces where the expected values
# imply line breaks; presumably the original newlines were lost -- restore
# them from the upstream test before running.
def test_blackberry_signature(): msg_body = """Heeyyoooo. Sent wirelessly from my BlackBerry device on the Bell network. Envoyé sans fil par mon terminal mobile BlackBerry sur le réseau de Bell.""" eq_(('Heeyyoooo.', msg_body[len('Heeyyoooo.\n'):]), bruteforce.extract_signature(msg_body)) msg_body = """Blah Enviado desde mi oficina móvil BlackBerry® de Telcel""" eq_(('Blah', 'Enviado desde mi oficina móvil BlackBerry® de Telcel'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on signatures introduced by a line
# of one or two em dashes ("—", "——") and checks that the dash line is
# returned as the first line of the signature.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals show spaces where the expected tuples
# contain "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_signature_separated_by_long_dashes(): msg_body = '''Wow. Awesome! — Bob Smith''' eq_(('Wow. Awesome!', '—\nBob Smith'), bruteforce.extract_signature(msg_body)) msg_body = '''Wow. Awesome! —— Bob Smith''' eq_(('Wow. Awesome!', '——\nBob Smith'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on BlackBerry device footers: an
# English + French "Sent wirelessly ..." footer (expected signature is
# everything after the first line, msg_body[len('Heeyyoooo.\n'):]) and a
# Spanish footer passed in and expected back as u'' literals.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals contain spaces where the expected values
# imply line breaks; presumably the original newlines were lost -- restore
# them from the upstream test before running.
def test_blackberry_signature(): msg_body = """Heeyyoooo. Sent wirelessly from my BlackBerry device on the Bell network. Envoyé sans fil par mon terminal mobile BlackBerry sur le réseau de Bell.""" eq_(('Heeyyoooo.', msg_body[len('Heeyyoooo.\n'):]), bruteforce.extract_signature(msg_body)) msg_body = u"""Blah Enviado desde mi oficina móvil BlackBerry® de Telcel""" eq_(('Blah', u'Enviado desde mi oficina móvil BlackBerry® de Telcel'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on signatures that contain an empty
# line ("-Lev." followed by a device footer, and "--" followed by a name) and
# checks that the blank line is kept inside the returned signature.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals show spaces where the expected tuples
# contain "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_blank_lines_inside_signature(): msg_body = '''Blah. -Lev. Sent from my HTC smartphone!''' eq_(('Blah.', '-Lev.\n\nSent from my HTC smartphone!'), bruteforce.extract_signature(msg_body)) msg_body = '''Blah -- John Doe''' eq_(('Blah', '--\n\nJohn Doe'), bruteforce.extract_signature(msg_body))
def extract_salutation(mail: mailparser.MailParser) -> str:
    """Return the sign-off part of ``mail`` as a reader would see it.

    The second element of ``extract_signature``'s result for the message text
    is returned when it is not ``None``. Otherwise the second element of
    ``talon.signature.extract`` is returned, called with the message text and
    ``mail.from_[0][1]``.
    """
    found = extract_signature(extract_message(mail))[1]
    if found is not None:
        return found
    # First extractor found nothing: fall back to talon.signature.extract.
    return talon.signature.extract(extract_message(mail), mail.from_[0][1])[1]
def email_pre_process(text, nlp, filters):
    """Run an email text through the cleaning pipeline and return the result.

    Steps, in order: drop the signature found by ``extract_signature``, apply
    ``remove_regex`` with ``filters['regex']``, apply ``remove_text`` with
    ``filters['text']``, run ``spacy_pipeline`` with ``nlp``, then strip
    surrounding whitespace.
    """
    body_only, _signature = extract_signature(text)
    without_patterns = remove_regex(body_only, filters['regex'])
    without_phrases = remove_text(without_patterns, filters['text'])
    processed = spacy_pipeline(without_phrases, nlp)
    return processed.strip()
# Exercises bruteforce.extract_signature on signatures that contain an empty
# line ("-Lev." followed by a device footer, and "--" followed by a name) and
# checks that the blank line is kept inside the returned signature.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals show spaces where the expected tuples
# contain "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_blank_lines_inside_signature(): msg_body = """Blah. -Lev. Sent from my HTC smartphone!""" eq_( ("Blah.", "-Lev.\n\nSent from my HTC smartphone!"), bruteforce.extract_signature(msg_body), ) msg_body = """Blah -- John Doe""" eq_(("Blah", "--\n\nJohn Doe"), bruteforce.extract_signature(msg_body))
def extract_signatures_rb(emails):
    """Split every email into body and signature with talon's bruteforce extractor.

    Returns a pair of lists ``(bodies, signatures)`` in input order. Each
    signature is passed through ``str()``, so a ``None`` signature appears as
    the string ``'None'``.
    """
    from talon.signature.bruteforce import extract_signature

    bodies = []
    signatures = []
    for email in emails:
        body, signature = extract_signature(email)
        bodies.append(body)
        signatures.append(str(signature))
    return bodies, signatures
def strip_sig_footer(bodytext):
    """Return ``bodytext`` with footers and the detected signature removed.

    Footers are removed first via ``strip_footers(bodytext, False)``; the
    result is then passed to ``extract_signature`` and only the text part of
    its return value is kept.
    """
    footerless = strip_footers(bodytext, False)
    # talon bruteforce technique: returns (text, signature); the signature is dropped
    body_without_sig, _ = extract_signature(footerless)
    return body_without_sig
# Exercises bruteforce.extract_signature on a message whose body contains a
# line starting with a signature word ("Thanks for your attention.") and
# checks that this line stays in the body while the signature starts at the
# "--" line.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literal shows spaces where the expected tuple
# contains "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_line_starts_with_signature_word(): msg_body = '''Hey man! Thanks for your attention. -- Thanks! Roman''' eq_(('Hey man!\nThanks for your attention.', '--\nThanks!\nRoman'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on a message whose body contains a
# line starting with a signature word ("Thanks for your attention.") and
# checks that this line stays in the body while the signature starts at the
# "--" line.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literal shows spaces where the expected tuple
# contains "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_line_starts_with_signature_word(): msg_body = """Hey man! Thanks for your attention. -- Thanks! Roman""" eq_( ("Hey man!\nThanks for your attention.", "--\nThanks!\nRoman"), bruteforce.extract_signature(msg_body), )
# Exercises bruteforce.extract_signature on signatures introduced by ASCII
# dashes: a "---" line, a name prefixed with "-" or "- ", and a "--" line.
# Each case compares the returned (body, signature) tuple with eq_.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals contain spaces where the expected tuples
# imply line breaks; presumably the original newlines were lost -- restore
# them from the upstream test before running.
def test_signature_separated_by_dashes(): msg_body = """Hey man! How r u? --- Roman""" eq_(("Hey man! How r u?", "---\nRoman"), bruteforce.extract_signature(msg_body)) msg_body = """Hey! -roman""" eq_(("Hey!", "-roman"), bruteforce.extract_signature(msg_body)) msg_body = """Hey! - roman""" eq_(("Hey!", "- roman"), bruteforce.extract_signature(msg_body)) msg_body = """Wow. Awesome! -- Bob Smith""" eq_(("Wow. Awesome!", "--\nBob Smith"), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on a message whose signature
# ("regards" ... "John Doe") contains several empty lines; the expected
# signature keeps three consecutive line breaks between the two parts.
# Per the test name, empty lines presumably do not count toward a maximum
# signature length -- that limit is not visible in this span.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literal shows spaces where the expected tuple
# contains "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_signature_max_lines_ignores_empty_lines(): msg_body = """Thanks, Blah regards John Doe""" eq_(('Thanks,\nBlah', 'regards\n\n\nJohn Doe'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on a message whose first line is a
# signature word ("Thanks,") and checks that this first line stays in the
# body; the expected signature starts at "regards".
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literal shows spaces where the expected tuple
# contains "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_signature_cant_start_from_first_line(): msg_body = """Thanks, Blah regards John Doe""" eq_(('Thanks,\n\nBlah', 'regards\n\nJohn Doe'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on a message whose body contains
# lines starting with "-->" and checks that those lines stay in the body
# while the signature starts at the bare "--" line.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literal shows spaces where the expected tuple
# contains "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_line_starts_with_dashes(): msg_body = '''Hey man! Look at this: --> one --> two -- Roman''' eq_(('Hey man!\nLook at this:\n\n--> one\n--> two', '--\nRoman'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on messages whose signature opens
# with a closing phrase ("Thanks!", "Best regards,", "Regards,"), with and
# without leading "--" delimiter lines, and compares the returned
# (body, signature) tuple with eq_.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals show spaces where the expected tuples
# contain "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_signature_words(): msg_body = """Hey! Thanks! Roman""" eq_(("Hey!", "Thanks!\nRoman"), bruteforce.extract_signature(msg_body)) msg_body = """Hey! -- Best regards, Roman""" eq_(("Hey!", "--\nBest regards,\n\nRoman"), bruteforce.extract_signature(msg_body)) msg_body = """Hey! -- -- Regards, Roman""" eq_(("Hey!", "--\n--\nRegards,\nRoman"), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on signatures introduced by ASCII
# dashes: a "---" line, a name prefixed with "-" or "- ", and a "--" line.
# Each case compares the returned (body, signature) tuple with eq_.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals contain spaces where the expected tuples
# imply line breaks; presumably the original newlines were lost -- restore
# them from the upstream test before running.
def test_signature_separated_by_dashes(): msg_body = '''Hey man! How r u? --- Roman''' eq_(('Hey man! How r u?', '---\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! -roman''' eq_(('Hey!', '-roman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! - roman''' eq_(('Hey!', '- roman'), bruteforce.extract_signature(msg_body)) msg_body = '''Wow. Awesome! -- Bob Smith''' eq_(('Wow. Awesome!', '--\nBob Smith'), bruteforce.extract_signature(msg_body))
# Exercises bruteforce.extract_signature on a message whose body contains
# lines starting with "-->" and checks that those lines stay in the body
# while the signature starts at the bare "--" line.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literal shows spaces where the expected tuple
# contains "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_line_starts_with_dashes(): msg_body = """Hey man! Look at this: --> one --> two -- Roman""" eq_( ("Hey man!\nLook at this:\n\n--> one\n--> two", "--\nRoman"), bruteforce.extract_signature(msg_body), )
def extract_mail(input):
    """Return the mail text with its signature removed, unless the signature has a verb.

    The characters ``=``, ``*`` and ``_`` are first replaced with ``-``
    (presumably so rules drawn with those characters are treated like dash
    delimiters -- confirm). ``extract_signature`` then splits the normalised
    text into ``(text, signature)``.

    Args:
        input: raw mail text.

    Returns:
        The normalised input unchanged when a signature was detected and
        ``detect_verb(signature)`` equals ``True``; otherwise the text part
        returned by ``extract_signature``.
    """
    input = input.replace("=", "-").replace("*", "-").replace("_", "-")
    text, signature = extract_signature(input)

    # The previous check, `type(signature) is "string"`, compared a type
    # object with a str literal and could never be true, so this branch was
    # dead. isinstance() is the working form of that check.
    if isinstance(signature, str):
        print("Detected sig")
        print(signature)
        # Equality with True is kept as in the original so detect_verb's
        # return value is matched the same way.
        if detect_verb(signature) == True:  # noqa: E712
            print("Signature has V")
            return input
    return text
# Exercises bruteforce.extract_signature on messages whose signature opens
# with a closing phrase ("Thanks!", "Best regards,", "Regards,"), with and
# without leading "--" delimiter lines, and compares the returned
# (body, signature) tuple with eq_.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body literals show spaces where the expected tuples
# contain "\n"; presumably the original line breaks were lost -- restore them
# from the upstream test before running.
def test_signature_words(): msg_body = '''Hey! Thanks! Roman''' eq_(('Hey!', 'Thanks!\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! -- Best regards, Roman''' eq_(('Hey!', '--\nBest regards,\n\nRoman'), bruteforce.extract_signature(msg_body)) msg_body = '''Hey! -- -- Regards, Roman''' eq_(('Hey!', '--\n--\nRegards,\nRoman'), bruteforce.extract_signature(msg_body))
def remove_signature(message):
    """Return ``message`` with its signature removed.

    Trailing whitespace is stripped from every line first. If any line is
    then exactly ``'--'``, ``extract_signature`` is used; otherwise
    ``sig.extract`` is called with a placeholder sender address.

    Args:
        message: the message text as a single string.

    Returns:
        The text part returned by whichever extractor was used.
    """
    # str.rstrip cannot fail on the pieces of a split string, so the former
    # bare `except: pass` guarded nothing; had it ever fired, `msg` would have
    # been unbound and the next statement would have raised NameError.
    msg = [line.rstrip() for line in message.split('\n')]
    message = '\n'.join(msg)

    if '--' in msg:
        text, _ = extract_signature(message)
    else:
        text, _ = sig.extract(message, sender='*****@*****.**')
    return text
def preprocess(emails):
    """Clean every entry of ``emails`` in place.

    For each email:
        1. the signature found by ``extract_signature`` is dropped
           (only English emails are supported);
        2. every line is stripped, empty lines are discarded, and the
           remaining lines are joined with single spaces.

    The sequence is modified in place; nothing is returned.
    """
    for idx in range(len(emails)):
        body, _ = extract_signature(emails[idx])
        stripped = (piece.strip() for piece in body.split('\n'))
        emails[idx] = ' '.join(piece for piece in stripped if piece != '')
# Exercises bruteforce.extract_signature on a "--" signature whose last line
# is a long Google Maps URL in angle brackets, and checks that the whole
# block, URL included, is returned as the signature. Per the test name, URL
# lines are presumably exempt from a signature line-length limit -- that
# limit is not visible in this span.
# NOTE(review): this test is collapsed onto one physical line. The
# triple-quoted literals show spaces where line breaks are implied, and the
# expected literal starts with a backslash followed by a space, which looks
# like a collapsed line continuation -- restore the original layout from the
# upstream test before running.
def test_signature_line_too_long_ignores_urls(): msg_body = """Thanks, this is a test -- Testy McTesterson CEO, Test, Inc. 100 Test St, Ste 100 | Austin, TX 78701 <https://maps.google.com/?q=100+Test+St,+Ste+100+%7C+Austin,+TX+78701&entry=gmail&source=g> """ eq_(('Thanks,\n\nthis is a test',"""\ -- Testy McTesterson CEO, Test, Inc. 100 Test St, Ste 100 | Austin, TX 78701 <https://maps.google.com/?q=100+Test+St,+Ste+100+%7C+Austin,+TX+78701&entry=gmail&source=g>"""), bruteforce.extract_signature(msg_body))
def clean_body(self, mail_body):
    """Split an email body into its own text, forwarded chain and signature.

    The body is cut at whichever of the delimiters
    ``"-----Original Message-----"``, ``"To:"`` or ``"From"`` occurs earliest;
    these mark the start of a forwarded email. The part before the cut is
    passed to ``extract_signature``.

    Returns:
        dict with ``'body'`` (text without signature), ``'chain'`` (text after
        the delimiter, or ``None`` when no delimiter occurs) and
        ``'signature'``.
    """
    delimiters = ["-----Original Message-----", "To:", "From"]

    # Keep the cut with the shortest leading part, i.e. the earliest delimiter.
    shortest = sys.maxsize
    best = None
    for marker in delimiters:
        head, found, tail = mail_body.partition(marker)
        if len(head) <= shortest:
            shortest = len(head)
            best = (head, tail if found else None)
    own_text, mail_chain = best

    # Talon separates the signature from what is left of the body.
    text, sig = extract_signature(own_text)
    return {'body': text, 'chain': mail_chain, 'signature': sig}
def parse_reply(filename):
    """Extract body contents from a reply, stripping away html tags.

    Reads ``filename`` and writes five sibling files whose names replace
    ``.raw.html`` with: ``.reply.body.txt``, ``.reply.title_body.txt``,
    ``.reply.body_no_signature.txt``, ``.reply.title_body_no_signature.txt``
    and ``.reply.body_tags.txt`` (JSON).

    Args:
        filename: str, full path of .raw.html file
    """
    with open(filename, 'r') as src:
        raw = src.read()

    title = parse_reply_title(raw)
    bodyhtml, bodytext = parse_reply_body(raw)
    # talon bruteforce technique to extract signature
    content, sig = extract_signature(bodytext)

    def write_variant(suffix, *chunks):
        # Write the chunks, in order, to the sibling file with this suffix.
        with open(filename.replace('.raw.html', suffix), 'w') as out:
            for chunk in chunks:
                out.write(chunk)

    write_variant('.reply.body.txt', bodytext)
    write_variant('.reply.title_body.txt', title, bodytext)
    write_variant('.reply.body_no_signature.txt', content)
    write_variant('.reply.title_body_no_signature.txt', title, content)

    # parse tags
    tag_data = parse_reply_tags(bodyhtml)
    with open(filename.replace('.raw.html', '.reply.body_tags.txt'), 'w') as out:
        out.write(json.dumps(tag_data))
def __init__(self, email_string):
    """Parse a raw email string into headers, body and signature attributes.

    Sets ``str``, ``raw``, ``recipients``, ``sender``, ``subject``, ``id``,
    ``date``, ``content_encoding``, ``full_body``, ``body`` and
    ``signature`` on the instance.
    """
    self.str = email_string
    self.raw = mime.from_string(self.str)

    to_header = self.raw.headers['To']
    if to_header is not None:
        to_header = to_header.lower()
        if ',' in to_header:
            parsed = address.parse_list(to_header)
        else:
            parsed = [address.parse(to_header)]
        # A recipient can be None for something like
        # 'Undisclosed recipients:;', so those entries are dropped.
        self.recipients = [rcpt for rcpt in parsed if rcpt is not None]
    else:
        self.recipients = []

    self.sender = address.parse(self.raw.headers['From'].lower())
    self.subject = self.raw.subject
    self.id = self.raw.message_id
    self.date = parse(self.raw.headers['Date'])
    self.content_encoding = self.raw.content_encoding[0]

    # Plaintext body: the message body itself, or the first text/plain part.
    # NOTE(review): full_body stays unset when the message is multipart
    # without a text/plain part (or is neither kind); the next statement
    # would then raise AttributeError -- confirm the intended fallback.
    if self.raw.content_type.is_singlepart():
        self.full_body = self.raw.body
    elif self.raw.content_type.is_multipart():
        for part in self.raw.parts:
            if part.content_type == 'text/plain':
                self.full_body = part.body
                break

    # Rule-based signature extraction first, ML approach if it finds nothing.
    self.body, self.signature = extract_signature(self.full_body)
    if self.signature is None:
        self.body, self.signature = signature.extract(self.full_body,
                                                      sender=self.sender)

    # Keep the reply only, not the quoted text.
    self.body = quotations.extract_from(self.body, 'text/plain')
def remove_signature(message):
    '''Take the message as a string and return it with the signature removed.

    Trailing whitespace is stripped from every line first. Method 1 is used
    when any line is then exactly '--': ``extract_signature`` splits the
    message. Otherwise method 2, the ML way, is used: ``sig.extract`` needs a
    sender argument, and a placeholder address is passed.

    Args:
        message: the message text as a single string.

    Returns:
        The text part returned by whichever extractor was used.
    '''
    # str.rstrip cannot fail on the pieces of a split string, so the former
    # bare `except: pass` guarded nothing; had it ever fired, `msg` would have
    # been unbound and the next statement would have raised NameError.
    msg = [line.rstrip() for line in message.split('\n')]
    message = '\n'.join(msg)

    if '--' in msg:
        text, _ = extract_signature(message)
    else:
        text, _ = sig.extract(message, sender='*****@*****.**')
    return text
def parse_email_quotes():
    """Add 'clean_body' and 'signature' to every archived email.

    Runs through each ``*.email.json`` file in ``archive/``. For files that
    lack either key, quoted text is removed from ``body`` with
    ``naive_quote_removal``, the result is split by ``extract_signature``,
    and the file is written back. Progress is printed every 1000 directory
    entries.

    NOTE(review): the collapsed source does not show whether the write-back
    was nested under the missing-key check; here a file is rewritten only
    when fields were added -- confirm.
    """
    talon.init()
    archive_dir = "archive/"
    for filenum, filename in enumerate(os.listdir(archive_dir)):
        if filenum % 1000 == 0:
            # Single-argument print(...) prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print(filenum)
        if not filename.endswith(".email.json"):
            continue

        full_filename = os.path.join(archive_dir, filename)
        # `with` closes the handle even when load() raises.
        with open(full_filename, "r") as fh:
            email_data = load(fh)

        if "clean_body" in email_data and "signature" in email_data:
            continue

        reply_body = naive_quote_removal(email_data['body'])
        email_data['clean_body'], email_data['signature'] = \
            extract_signature(reply_body)

        # Serialise before opening for writing so a dumps() failure cannot
        # leave a truncated file behind.
        serialized = dumps(email_data)
        with open(full_filename, "w") as fh:
            fh.write(serialized)
def get_cleaned_email(parsed_email):
    """Return a cleaned ``(title, body)`` pair for a parsed email.

    The first plain-text part is preferred: it is cleaned with
    ``clean_email_text`` and its signature is removed from the body. Without
    a plain-text part, the result of ``clean_email_html`` on the first HTML
    part is returned. ``(None, None)`` is returned when the email has
    neither.
    """
    if parsed_email.text_plain:
        text_part = parsed_email.text_plain[0]
    else:
        text_part = None
    if parsed_email.text_html:
        html_part = parsed_email.text_html[0]
    else:
        html_part = None

    if not (text_part or html_part):
        return None, None
    if not text_part:
        return clean_email_html(html_part)

    title, body = clean_email_text(text_part)
    # extract_signature seems to not support html code as input
    body, signature = extract_signature(body)
    print(f"striped out signature in the email: {signature}")
    # TODO optionally: if signature == None, which may be because it has not
    # been recognized, additionally apply:
    #   from talon import signature
    #   body3, signature = signature.extract(body2, sender='*****@*****.**')
    return (title, body)
def parse_email_quotes():
    """Add 'clean_body' and 'signature' to every archived email.

    Runs through each ``*.email.json`` file in ``archive/``. For files that
    lack either key, quoted text is removed from ``body`` with
    ``naive_quote_removal``, the result is split by ``extract_signature``,
    and the file is written back. Progress is printed every 1000 directory
    entries.

    NOTE(review): the collapsed source does not show whether the write-back
    was nested under the missing-key check; here a file is rewritten only
    when fields were added -- confirm.
    """
    talon.init()
    archive_dir = "archive/"
    for filenum, filename in enumerate(os.listdir(archive_dir)):
        if filenum % 1000 == 0:
            # Single-argument print(...) prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print(filenum)
        if not filename.endswith(".email.json"):
            continue

        full_filename = os.path.join(archive_dir, filename)
        # `with` closes the handle even when load() raises.
        with open(full_filename, "r") as fh:
            email_data = load(fh)

        if "clean_body" in email_data and "signature" in email_data:
            continue

        reply_body = naive_quote_removal(email_data['body'])
        email_data['clean_body'], email_data['signature'] = \
            extract_signature(reply_body)

        # Serialise before opening for writing so a dumps() failure cannot
        # leave a truncated file behind.
        serialized = dumps(email_data)
        with open(full_filename, "w") as fh:
            fh.write(serialized)
def process_sign(file):
    """Tokenise the message part of an email file, ending with a '[SIGN]' marker.

    The file is read line by line. Lines containing any entry of
    ``exclude_headers`` are dropped and the rest are joined.
    ``extract_signature_ml`` is tried first with the sender taken from the
    third line of the file; when it reports no signature,
    ``extract_signature`` is used instead.

    Args:
        file: path of the email file to read.

    Returns:
        Lower-cased ``nltk.wordpunct_tokenize`` tokens of the message text
        followed by ``'[SIGN]'``.

    Raises:
        IndexError: if the file has fewer than three lines.
    """
    with open(file, 'r') as fp:
        lines = fp.readlines()

    # The sender is the last space-separated field of the third line.
    # NOTE(review): readlines() keeps the line ending, so this value
    # presumably still carries a trailing newline -- confirm that
    # extract_signature_ml tolerates it.
    sender = lines[2].split(' ')[-1]

    stripped = [
        line for line in lines
        if not any(header in line for header in exclude_headers)
    ]
    email_stripped = ''.join(stripped)

    msg, signature = extract_signature_ml(email_stripped, sender)
    if signature is None:
        msg, signature = extract_signature(email_stripped)

    return [token.lower() for token in nltk.wordpunct_tokenize(msg + '[SIGN]')]
ac_lines_talon = 0.0 ac_lines_correct = 0.0 for message_id in dataset: csv_signature, csv_authored_content = csv_munge(message_id) # print message_id # print csv_signature # print csv_authored_content # print "-------" text = dataset[message_id] # find talon signatures results = extract_signature(text) if results[1]: talon_signature = results[1].split('\n') else: talon_signature = [] #find talon authored content talon_authored_content = quotations.extract_from_plain(text).split('\n') # do a comparative scoring of results found if len(talon_signature) > 0 or len(csv_signature) > 0: required = set(csv_signature) signature_lines_total += len(csv_signature) for line in talon_signature: if len(line) > 0: signature_lines_talon += 1.0
def extract_signatures_rb(emails):
    """Return the signature of every email, as strings, in input order.

    Each email is split with ``extract_signature`` and only the signature
    part is kept. Signatures are passed through ``str()``, so a ``None``
    signature appears as the string ``'None'``.

    Args:
        emails: iterable of email texts.

    Returns:
        list of str, one entry per email.
    """
    # The list of bodies the previous version built was never used.
    return [str(signature) for _, signature in map(extract_signature, emails)]
ac_lines_talon = 0.0 ac_lines_correct = 0.0 for message_id in dataset: csv_signature, csv_authored_content = csv_munge(message_id) # print message_id # print csv_signature # print csv_authored_content # print "-------" text = dataset[message_id] # find talon signatures results = extract_signature(text) if results[1]: talon_signature = results[1].split('\n') else: talon_signature = [] #find talon authored content talon_authored_content = quotations.extract_from_plain(text).split('\n') # do a comparative scoring of results found if len(talon_signature) > 0 or len(csv_signature) > 0: required = set(csv_signature) signature_lines_total += len(csv_signature) for line in talon_signature: if len(line) > 0:
# Asserts that bruteforce.extract_signature returns the input unchanged with
# a None signature for this body.
# NOTE(review): the test name refers to a crash, but no fault injection is
# visible in this span; presumably a patch decorator precedes the def --
# confirm. The test is also collapsed onto one physical line and the
# triple-quoted msg_body presumably lost its line break -- restore it from
# the upstream test before running.
def test_crash_in_extract_signature(): msg_body = '''Hey! -roman''' eq_((msg_body, None), bruteforce.extract_signature(msg_body))
def test_no_signature():
    """A body without any signature comes back whole, with a None signature."""
    plain_body = 'Hey man!'
    actual = bruteforce.extract_signature(plain_body)
    eq_((plain_body, None), actual)
def test_signature_only():
    """A message made only of a '--' line and a name is returned as the body."""
    delimiter_and_name = '--\nRoman'
    actual = bruteforce.extract_signature(delimiter_and_name)
    eq_((delimiter_and_name, None), actual)
# Exercises bruteforce.extract_signature on a message ending with the
# "Sent from my iPhone!" footer and checks that the footer is returned as
# the signature.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body shows spaces where the expected split implies line
# breaks; presumably the original newlines were lost -- restore them from the
# upstream test before running.
def test_iphone_signature(): msg_body = '''Hey! Sent from my iPhone!''' eq_(('Hey!', 'Sent from my iPhone!'), bruteforce.extract_signature(msg_body))
def test_empty_body():
    """An empty message yields an empty body and a None signature."""
    actual = bruteforce.extract_signature('')
    eq_(('', None), actual)
# Exercises bruteforce.extract_signature on a message ending with the
# "Sent from Mailbox for iPhone" footer and checks that the footer is
# returned as the signature.
# NOTE(review): this test is collapsed onto one physical line, and the
# triple-quoted msg_body shows spaces where the expected split implies line
# breaks; presumably the original newlines were lost -- restore them from the
# upstream test before running.
def test_mailbox_for_iphone_signature(): msg_body = """Blah Sent from Mailbox for iPhone""" eq_(("Blah", "Sent from Mailbox for iPhone"), bruteforce.extract_signature(msg_body))
# Demo script: runs talon's quotation and signature extraction on one sample
# email and prints both results.
__author__ = 'a_medelyan'

import talon
from talon import quotations
from talon.signature.bruteforce import extract_signature

talon.init()

# Sample message: a short reply, a contact block, then a quoted
# "-----Original Message-----" section.
text = "The price is still 91.87.\n\nKeoni Almeida\nCalifornia Independent System Operator\nphone: 916/608-7053\npager: 916/814-7352\nalpha page: [email protected]\ne-mail: <mailto:[email protected]>\n\n\n\n> -----Original Message-----\n> From:\tCRCommunications\n> Sent:\tFriday, June 22, 2001 11:34 AM\n> To:\tISO Market Participants\n> Subject:\tCAISO Notice: Update to June 20 Market Notice\n>\n> <<MARKET NOTICE 010622_.doc>>\n>\n> Market Participants:\n> Please read the attached explanation of Footnote 14 in the California ISO\n> June 20, 2001, Market Notice.\n>\n> CR Communications\n> Client Relations Communications\n\n - MARKET NOTICE 010622_.doc"

reply = quotations.extract_from_plain(text)
signature = extract_signature(text)[1]

# The Python 2 statement `print "Reply: ", reply` emitted the label, a
# separator space, then the value. A single pre-formatted argument prints the
# same text under both the Python 2 print statement and the Python 3 print
# function.
print("Reply:  %s" % (reply,))
print("Signature:  %s" % (signature,))