def test_attachment_reconstruction():
    """Check that EmailV1.restoreAttachments puts the real attachment in place.

    The message carries a ``dummy/dummy`` part whose filename is "handle";
    after restoring with ``{"handle": attachment}`` the second part is
    expected to equal the supplied attachment.
    """
    # NOTE(review): this fixture appears on a single physical line here; a MIME
    # message needs real line breaks between headers and parts -- confirm the
    # literal against the original test file.
    raw_message = """Received: ConsoleMessageDelivery From: "(Secure) iOS Dev" <*****@*****.**> Content-Type: multipart/related; boundary="Apple-Mail=_B3D4A2AE-CBE5-47E8-86E4-5052190755A6"; type="text/plain" Subject: Message-Id: <*****@*****.**> Date: Mon, 27 Jun 2016 10:43:03 -0400 To: iOS Dev <*****@*****.**> Mime-Version: 1.0 (Mac OS X Mail 9.3 \(3124\)) --Apple-Mail=_B3D4A2AE-CBE5-47E8-86E4-5052190755A6 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset=us-ascii blah blah blah --Apple-Mail=_B3D4A2AE-CBE5-47E8-86E4-5052190755A6 Content-Transfer-Encoding: base64 Content-Disposition: dummy; filename="handle" Content-Type: dummy/dummy; name="handle" cGxhY2Vob2xkZXIgZm9yIGFuIGF0dGFjaG1lbnQ= """
    # Standalone text/plain attachment part with random content.
    attachment_content = randUnicode()
    raw_attachment = "Content-Type: text/plain; name=\"test.txt\"\r\nContent-Disposition: attachment; filename=\"test.txt\"\r\n\r\n{}".format(attachment_content)
    message = mime.from_string(raw_message)
    attachment = mime.from_string(raw_attachment)
    # The key matches the filename of the dummy part in raw_message.
    attachments = {"handle": attachment}
    status, restored_message = EmailV1.restoreAttachments(message, attachments)
    assert status
    assert len(restored_message.parts) == 2
    assert restored_message.parts[1] == attachment
def emailParse(email, data):
    """Populate ``email`` from the raw message in ``data``.

    Sets ``sender``, ``subject`` and ``timeStamp`` from the headers, fills
    ``textPlain`` / ``textHtml`` from matching parts, and writes PDF and
    octet-stream parts out through ``emailUtil.writeFile``, appending each
    filename (comma-terminated) to ``email.attachments``.

    :param email: object whose attributes are filled in; ``messageId`` and
        ``attachments`` are read from it.
    :param data: raw message; non-ASCII characters are dropped before parsing.
    """
    msg = mime.from_string(str(data).decode('utf-8').encode('ascii', 'ignore'))

    sender = msg.headers['From']
    senderEmail = re.search(emailRegex, sender)
    if senderEmail:
        email.sender = senderEmail.group(0)
    else:
        # No address matched: report it and keep the raw header value rather
        # than failing on ``None.group(0)``.
        print(sender)
        email.sender = sender

    email.subject = msg.headers['Subject']

    # Drop a trailing parenthesised comment such as "(UTC)" before parsing.
    date = msg.headers['Date'].partition('(')[0]
    email.timeStamp = dateParser.parse(date)

    if msg.content_type.is_multipart():
        for part in msg.parts:
            if part.content_type == 'text/plain':
                email.textPlain = cleanText(part.body)
            elif part.content_type == 'text/html':
                email.textHtml = htmlToText(part.body)
            elif part.content_type == 'application/pdf' or part.content_type == 'application/octet-stream':
                # Content-Disposition parses to (value, params).
                filename = part.headers['content-Disposition'][1]['filename']
                emailUtil.writeFile(email.messageId, part.body, filename)
                email.attachments = email.attachments + filename + ','
    elif msg.content_type.is_singlepart():
        if msg.headers['Content-Type'] == 'text/plain':
            email.textPlain = cleanText(msg.body)
        elif msg.headers['Content-Type'] == 'text/html':
            email.textHtml = htmlToText(msg.body)
def _process_message(self, uid, mailbox, download_attachments=DEFAULT_DOWNLOAD_ATTACHMENTS):
    """Fetch message ``uid`` from ``mailbox`` and dispatch it as a trigger payload.

    Attachments are downloaded and stored only when the message has some and
    ``download_attachments`` is truthy; otherwise ``attachments`` stays empty.
    """
    message = mailbox.mail(uid, include_raw=True)
    mime_msg = mime.from_string(message.raw)
    has_attachments = bool(message.attachments)

    payload = {
        'uid': uid,
        'from': message.from_addr,
        'to': message.to,
        # Flatten the headers so they can be unpickled
        'headers': self._flattern_headers(headers=mime_msg.headers.items()),
        'date': message.date,
        'subject': message.title,
        'message_id': message.message_id,
        'body': message.body,
        'has_attachments': has_attachments,
        'attachments': []
    }

    if has_attachments and download_attachments:
        self._logger.debug('[IMAPSensor]: Downloading attachments for message {}'.format(uid))
        payload['attachments'] = self._download_and_store_message_attachments(message=message)

    self._sensor_service.dispatch(trigger=self._trigger, payload=payload)
def process_attachement(attachment, detected_content_type, detected_file_name, origin_domain, passwordlist, sha):
    """Analyse one attachment, recursing into the parts of a multipart payload.

    :param attachment: raw attachment content.
    :param detected_content_type: content type reported for the attachment.
    :param detected_file_name: file name reported for the attachment (may be
        ``None`` or empty).
    :param origin_domain: passed through to ``ExtractURL`` / ``process_payload``.
    :param passwordlist: candidate passwords; the file name and its stem are
        appended to it.
    :param sha: message identifier passed through to the helpers.
    :returns: ``(indicators, suspicious_urls, is_archive, payload_results)``.
    """
    indicators = 0
    payload_results = []
    suspicious_urls = set()

    try:
        mpart_attachment = mime.from_string(attachment)
        if mpart_attachment.content_type.is_multipart():
            for p in mpart_attachment.walk():
                # NOTE(review): this rebinding also changes the content type
                # recorded for the enclosing attachment below -- confirm that
                # is intended.
                detected_content_type = str(p.detected_content_type)
                filename = detected_file_name
                # The function returns four values; the previous three-name
                # unpacking raised ValueError. A sub-part's archive flag is
                # not needed here.
                ind, s_urls, _is_archive, payload_r = process_attachement(
                    p.body, detected_content_type, filename,
                    origin_domain, passwordlist, sha)
                indicators += ind
                suspicious_urls |= set(s_urls)
                payload_results += payload_r
    except DecodingError:
        # Binary attachment: not parseable as MIME, analyse it as-is below.
        pass

    extract_urls = ExtractURL(attachment, origin_domain, sha)
    suspicious_urls |= set(extract_urls.processing())
    indicators += extract_urls.indicators

    content_type = detected_content_type
    filename = detected_file_name
    if filename is not None and len(filename) > 0:
        passwordlist.append(filename)
        prefix, suffix = os.path.splitext(filename)
        passwordlist.append(prefix)
        passwordlist = [i for i in passwordlist if len(i) > 1]

    r_indicators, is_archive, r = process_payload(
        filename, attachment, content_type, origin_domain, passwordlist, sha)
    r['filename'] = filename
    r['content_type'] = content_type
    indicators += r_indicators
    payload_results.append(r)

    return indicators, list(suspicious_urls), is_archive, payload_results
def test_reply_quotations_share_block():
    """The HTML part must survive quotation stripping without the 'From' line."""
    walked = list(mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK).walk())
    html_part = walked[1]
    assert html_part.content_type == 'text/html'

    result = quotations.extract_from_html(html_part.body)
    ok_(result)
    ok_('From' not in result)
def _extract_parts(namespace_id, folder_id, body_string):
    """Store the raw message and parse each of its non-multipart MIME parts.

    The message is saved to the blockstore under its SHA-256 unless already
    present, then parsed; every non-multipart part is handed to
    ``_parse_mimepart``. Parse failures are logged and end processing.
    """
    data_sha256 = sha256(body_string).hexdigest()
    if not is_in_blockstore(data_sha256):
        save_to_blockstore(data_sha256, body_string)

    try:
        parsed = mime.from_string(body_string)
    except (mime.DecodingError, AttributeError, RuntimeError,
            TypeError) as e:
        log.error('Error parsing message metadata',
                  folder_id=folder_id, namespace_id=namespace_id, error=e)
        return

    if parsed is None:
        return

    # A single-part message is its own only part, so include it in the walk.
    for mimepart in parsed.walk(
            with_self=parsed.content_type.is_singlepart()):
        try:
            if mimepart.content_type.is_multipart():
                continue  # TODO should we store relations?
            _parse_mimepart(namespace_id, mimepart)
        except (mime.DecodingError, AttributeError, RuntimeError,
                TypeError, binascii.Error, UnicodeDecodeError) as e:
            log.error('Error parsing message MIME parts',
                      folder_id=folder_id, namespace_id=namespace_id,
                      exc_info=True)
            # NOTE(review): placed inside the handler (stop at the first bad
            # part) -- confirm against the original indentation.
            return
def test_standard_replies(filename):
    """Every text/plain part of the fixture must reduce to the expected reply.

    The expected text is read from ``<filename minus 4 chars>_reply_text``
    when that file exists and is ``'Hello'`` otherwise.
    """
    def expected_reply():
        reply_text_fn = filename[:-4] + '_reply_text'
        if not os.path.isfile(reply_text_fn):
            return 'Hello'
        with open(reply_text_fn) as f:
            return f.read()

    def check_part(email_part):
        parsed = quotations.extract_from_plain(email_part.body)
        expected_text = expected_reply()
        assert parsed == expected_text, 'Parsed text was incorrect for file {0}'.format(
            filename
        )

    with open(filename) as f:
        m = mime.from_string(f.read())

    # Lazily yield the text/plain parts so that checks interleave with the walk.
    if m.content_type == 'text/plain':
        plain_parts = iter([m])
    else:
        plain_parts = (p for p in m.walk() if p.content_type == 'text/plain')

    found_any = False
    for part in plain_parts:
        found_any = True
        check_part(part)

    if not found_any:
        pytest.fail('Could not find text/plain part in email {0}'.format(filename))
def ingest(self, meta, local_path):
    """Ingest an e-mail file: attachments individually, the body as HTML or text.

    The first non-empty body part is used; once an HTML body has been picked,
    later body parts are ignored. Raises ``IngestorException`` when no body
    is available.
    """
    with open(local_path, 'rb') as emlfh:
        msg = mime.from_string(emlfh.read())
    meta = self.parse_headers(msg, meta)

    body_type = 'text/plain'
    body_part = msg.body
    for part in msg.walk():
        if part.is_body():
            candidate = part.body
            if 'html' in body_type or candidate is None or not len(candidate.strip()):
                continue
            body_type = unicode(part.detected_content_type)
            body_part = candidate
        else:
            self.ingest_attachment(part, meta)

    out_path = ''
    if body_part is None:
        raise IngestorException("No body in E-Mail: %r" % meta)

    try:
        is_html = 'html' in body_type
        out_path = self.write_temp(body_part, 'htm' if is_html else 'txt')
        if is_html:
            ing = HtmlIngestor(self.source_id)
        else:
            ing = DocumentIngestor(self.source_id)
        ing.ingest(meta, out_path)
    finally:
        remove_tempfile(out_path)
def test_quoted_printable_encoding_avoided_for_compatibility(
        patch_smtp, api_client):
    """Messages with long lines must not be quoted-printable encoded.

    This is for maximum server compatibility: the HTML part has to be base64
    and the plain part either 7bit or base64.
    """
    long_body = (
        'Etenim quid est, Catilina, quod iam amplius exspectes, si '
        'neque nox tenebris obscurare coeptus nefarios neque privata domus '
        'parietibus continere voces conjurationis tuae potest? Si '
        'illustrantur, si erumpunt omnia? Muta iam istam mentem, mihi crede! '
        'obliviscere caedis atque incendiorum. Teneris undique: luce sunt '
        'clariora nobis tua consilia omnia; quae iam mecum licet recognoscas.'
        ' Meministine me ante diem duodecimum Kalendas Novembres dicere in '
        'senatu, fore in armis certo die, qui dies futurus esset ante diem '
        'sextum Kalendas Novembres, C. Manlium, audaciae satellitem atque '
        'administrum tuae? Num me fefellit, Catilina, non modo res tanta, tam'
        ' atrox, tamque incredibilis, verum id quod multo magis admirandum, '
        'dies? ')
    api_client.post_data(
        '/send',
        {'to': [{'email': '*****@*****.**'}],
         'subject': 'In Catilinam',
         'body': long_body})

    _, msg = patch_smtp[-1]
    parsed = mime.from_string(msg)
    assert len(parsed.parts) == 2

    allowed_encodings = {
        'text/html': ('base64',),
        'text/plain': ('7bit', 'base64'),
    }
    for part in parsed.parts:
        kind = part.content_type.value
        if kind in allowed_encodings:
            assert part.content_encoding[0] in allowed_encodings[kind]
def get_header(self, header, mid):
    """Return ``header`` from the stored full body, or None after a decode error."""
    if not self.decode_error:
        return mime.from_string(self.full_body.data).headers.get(header)

    log.warning('Error getting message header', mid=mid)
    return None
def test_send_with_event(patch_smtp, api_client, example_draft, event):
    """Sending a draft with an event ID must add a text/calendar part."""
    msgs = patch_smtp

    # Create a draft
    r = api_client.post_data('/drafts', example_draft)
    assert r.status_code == 200
    draft_public_id = json.loads(r.data)['id']
    version = json.loads(r.data)['version']

    # Send the draft along with an event ID to use for invites
    send_request = {'draft_id': draft_public_id,
                    'version': version,
                    'event_id': event.public_id}
    r = api_client.post_data('/send', send_request)
    assert r.status_code == 200

    # Make sure one message was sent
    assert len(msgs) == 1
    recipients, raw_msg = msgs[0]
    msg = mime.from_string(raw_msg)

    # Check the MIME body of the message to make sure the event is there
    parts = [
        (mimepart.content_type.format_type, mimepart.content_type.subtype)
        for mimepart in msg.walk(with_self=msg.content_type.is_singlepart())
    ]
    for wanted in (('text', 'plain'), ('text', 'html'), ('text', 'calendar')):
        assert wanted in parts
def test_from_mime():
    """Build a MIME tree, run it through EmailV1.fromMime and check the dummies.

    Every attachment is expected to be replaced in the body by a dummy part
    whose filename is the hex SHA-256 of the serialized attachment.
    """
    # create email from separated mime and test if it get reconstructed ok
    root_mime = mime.create.multipart("mixed")
    attachments = []

    def add_text():
        root_mime.append(mime.create.text("plain", randUnicode(length=3)))

    def add_attachments(count, content_type, size, disposition):
        for _ in range(count):
            a = mime.create.attachment(content_type, randStr(size=size), randUnicode(), disposition)
            attachments.append(a)
            a.to_string()
            root_mime.append(a)

    add_text()
    add_attachments(2, "image/png", 10, AttachmentType.INLINE)
    add_text()
    add_attachments(3, "video/mp4", 15, AttachmentType.ATTACHMENT)

    root_mime.headers["Message-Id"] = u"<{}>".format(EmailHelpers.newMessageId())
    email = EmailV1.fromMime(root_mime.to_string(), [], {"user_id": u"*****@*****.**", "display_name": u"S B"})

    # check if the attachments have been all separated properly
    body_mime = mime.from_string(email.body.content)
    dummy_parts = [p for p in body_mime.parts if p.content_type.value == DUMMY_CONTENT_TYPE]
    assert len(attachments) == len(dummy_parts)

    # check att hashes are properly inserted as filenames
    expected_names = [HexEncode(Sha256Sum(a.to_string())) for a in attachments]
    actual_names = [p.content_disposition[1]["filename"] for p in dummy_parts]
    assert expected_names == actual_names
def test_reply_headers_set(patch_smtp, api_client, example_draft):
    """A message sent into a thread must carry In-Reply-To and References."""
    threads = api_client.get_data('/threads')
    thread_id = threads[0]['id']

    request = {'to': [{'email': '*****@*****.**'}],
               'thread_id': thread_id}
    api_client.post_data('/send', request)

    _, msg = patch_smtp[-1]
    headers = mime.from_string(msg).headers
    for name in ('In-Reply-To', 'References'):
        assert name in headers
def returnText(email):
    """Return the text/plain body of ``email``, or None when there is none.

    For a multipart message the first text/plain part wins; a single-part
    message is used only when its Content-Type is text/plain.
    """
    msg = mime.from_string(str(email))

    if msg.content_type.is_multipart():
        plain_bodies = (part.body for part in msg.parts
                        if part.content_type == 'text/plain')
        return next(plain_bodies, None)

    if msg.content_type.is_singlepart() and msg.headers['Content-Type'] == 'text/plain':
        return msg.body

    return None
def process_headers(sha, store):
    """Examine the headers of the stored mail ``sha`` under directory ``store``.

    Returns ``(subject, origin_ip, rbl_listed, rbl_comment, mailfrom, mailto,
    origin_domain, indicators)``, or None when the file does not exist.
    """
    mailpath = os.path.join(store, sha)
    if not os.path.exists(mailpath):
        return None

    with open(mailpath, 'rb') as f:
        mail = mime.from_string(f.read())

    examine_headers = ExamineHeaders(mail, sha)
    results = examine_headers.processing()
    origin_ip, rbl_listed, rbl_comment, mailfrom, mailto, origin_domain = results
    return (mail.subject, origin_ip, rbl_listed, rbl_comment,
            mailfrom, mailto, origin_domain, examine_headers.indicators)
def parse_message(self, mailfrom, rcpttos, data):
    """Parse a raw message into a payload dict.

    Sender and addressee come from the headers when present and fall back to
    the raw protocol values ``mailfrom`` / ``rcpttos``. A single-part message
    goes to ``body_plain``; for multipart messages text/plain and text/html
    parts fill the body fields and attachment parts are listed with their
    hashes and base64-encoded data.
    """
    message = mime.from_string(data)
    headers = message.headers
    payload = {
        'from': None,
        'to': None,
        'subject': None,
        'date': None,
        'body_plain': None,
        'body_html': None,
        'attachments': [],
        'headers': headers.items(),
    }

    # Addressee and sender: header value if present, else the protocol value.
    payload['to'] = headers['To'] if 'To' in headers.keys() else rcpttos
    payload['from'] = headers['From'] if 'From' in headers.keys() else mailfrom

    for field, header_name in (('subject', 'Subject'), ('date', 'Date')):
        if header_name in headers.keys():
            payload[field] = headers[header_name]

    # Body
    if message.content_type.is_singlepart():
        payload['body_plain'] = message.body
    elif message.content_type.is_multipart():
        body_fields = {'text/plain': 'body_plain', 'text/html': 'body_html'}
        for part in message.parts:
            content_type = part.content_type[0]
            if content_type in body_fields:
                payload[body_fields[content_type]] = part.body
                continue
            if not part.is_attachment():
                continue
            raw = part.body
            payload['attachments'].append({
                'filename': part.detected_file_name,
                'md5': hashlib.md5(raw).hexdigest(),
                'sha1': hashlib.sha1(raw).hexdigest(),
                'data': base64.b64encode(raw),
                'encoding': part.content_encoding[0],
                'type': content_type,
            })

    return payload
def test_address_parsing_edge_cases():
    """Check that header parsing can handle a variety of tricky input."""
    cases = [
        # Extra quotes around display name
        ('From: ""Bob"" <*****@*****.**>',
         [(' Bob ', '*****@*****.**')]),
        # Comments after addr-spec
        ('From: "Bob" <*****@*****.**>(through Yahoo! Store Order System)',
         [('Bob', '*****@*****.**')]),
        ('From: Indiegogo <*****@*****.**> (no reply)',
         [('Indiegogo', '*****@*****.**')]),
        ('From: Anon <*****@*****.**> (GitHub Staff)',
         [('Anon', '*****@*****.**')]),
        # Display name in comment
        ('From: root@gunks (Cron Daemon)',
         [('Cron Daemon', 'root@gunks')]),
        # Missing closing angle bracket
        ('From: Bob <*****@*****.**',
         [('Bob', '*****@*****.**')]),
        # Blank (spammers)
        ('From: ()', []),
        # Missing header
        ('', []),
        # Duplicate header
        ('From: [email protected]\r\n'
         'From: [email protected]',
         [('', '*****@*****.**')]),
    ]

    for raw, expected in cases:
        mimepart = mime.from_string(raw)
        parsed = parse_mimepart_address_header(mimepart, 'From')
        assert parsed == expected
def test_draft_updates(db, default_account, mock_imapclient):
    """Exercise save, update and delete of a draft against a mocked IMAP client.

    After each operation the Drafts folder is re-selected and the number of
    UIDs in it is checked; after the version bump the stored message's
    Message-Id header must reflect the draft's public id and version.
    """
    # Set up folder list
    mock_imapclient._data['Drafts'] = {}
    mock_imapclient._data['Trash'] = {}
    mock_imapclient.list_folders = lambda: [
        (('\\HasNoChildren', '\\Drafts'), '/', 'Drafts'),
        (('\\HasNoChildren', '\\Trash'), '/', 'Trash')
    ]

    pool = writable_connection_pool(default_account.id)

    draft = create_message_from_json({'subject': 'Test draft'},
                                     default_account.namespace, db.session,
                                     True)
    draft.is_draft = True
    draft.version = 0
    db.session.commit()

    # The first save must put exactly one message into Drafts.
    save_draft(default_account.id, draft.id, {'version': 0})
    with pool.get() as conn:
        conn.select_folder('Drafts', lambda *args: True)
        assert len(conn.all_uids()) == 1

    # Check that draft is not resaved if already synced.
    update_draft(default_account.id, draft.id, {'version': 0})
    with pool.get() as conn:
        conn.select_folder('Drafts', lambda *args: True)
        assert len(conn.all_uids()) == 1

    # Check that an older version is deleted
    draft.version = 4
    sendmail_update_draft(db.session, default_account, draft,
                          from_addr=draft.from_addr, subject='New subject',
                          blocks=[])
    db.session.commit()

    update_draft(default_account.id, draft.id, {'version': 5})
    with pool.get() as conn:
        conn.select_folder('Drafts', lambda *args: True)
        all_uids = conn.all_uids()
        # Still a single message: the old copy was replaced, not duplicated.
        assert len(all_uids) == 1
        data = conn.uids(all_uids)[0]
        parsed = mime.from_string(data.body)
        expected_message_id = '<{}-{}@mailer.nylas.com>'.format(
            draft.public_id, draft.version)
        assert parsed.headers.get('Message-Id') == expected_message_id

    # Deleting the draft must leave the Drafts folder empty.
    delete_draft(default_account.id, draft.id,
                 {'message_id_header': draft.message_id_header,
                  'inbox_uid': draft.inbox_uid, 'version': 5})
    with pool.get() as conn:
        conn.select_folder('Drafts', lambda *args: True)
        all_uids = conn.all_uids()
        assert len(all_uids) == 0
def test_sending_from_email_alias(patch_smtp, api_client):
    """An explicit 'from' in the send request must appear in the From header."""
    request = {
        'to': [{'email': '*****@*****.**'}],
        'from': [{'name': 'admin', 'email': '*****@*****.**'}],
        'subject': 'Banalities',
        'body': '<html>Hello there</html>',
    }
    api_client.post_data('/send', request)

    _, msg = patch_smtp[-1]
    headers = mime.from_string(msg).headers
    assert 'From' in headers
    assert headers['From'] == 'admin <*****@*****.**>'
def load_from_stdin(self, logger):
    ''' Load email from standard input

    Reads stdin to EOF, parses the text into ``self.raw`` and tags it with an
    ``X-Capkopper-Filter-UUID`` header carrying ``self.uuid``.
    '''
    logger.info("%s: loading email from stdin" % self.uuid)
    # A single read() yields the same text as concatenating every line, and
    # avoids rebuilding the string once per line.
    email_string = sys.stdin.read()
    self.raw = mime.from_string(email_string)
    self.raw.headers.add("X-Capkopper-Filter-UUID", self.uuid)
def get(self, request, *args, **kwargs):
    """Proxy a Gmail thread lookup and return its messages as JSON.

    The thread ``kwargs["threadId"]`` is fetched with the caller's
    ``access_token``; every message in it is then fetched individually.
    With a ``format`` query parameter the raw message is returned, passed
    through ``self.msg_filter`` first when ``decode`` is also given;
    without ``format`` the payload parts go to ``self.parse_parts``.
    A non-200 thread response is relayed with its status code.
    """
    # Forward only the optional query parameters that were actually supplied.
    params = {"access_token": request.GET.get("access_token")}
    if request.GET.get("maxResults"):
        params["maxResults"] = request.GET.get("maxResults")
    if request.GET.get("pageToken"):
        params["pageToken"] = request.GET.get("pageToken")
    if request.GET.get("q"):
        params["q"] = request.GET.get("q")
    r = requests.get(
        "https://www.googleapis.com/gmail/v1/users/me/threads/{}".format(kwargs.get("threadId")), params=params
    )
    if r.status_code == 200:
        ans = r.json()
        msgs = []
        for m in r.json()["messages"]:
            mr = requests.get(
                "https://www.googleapis.com/gmail/v1/users/me/messages/{}".format(m["id"]),
                params={"access_token": request.GET.get("access_token"), "format": request.GET.get("format")},
            )
            # Messages whose fetch did not return 200 are left out.
            if mr.status_code == 200:
                ans_msg = {
                    "id": m["id"],
                    "opened": True if cache.get(m["id"]) else False,
                    "snippet": mr.json()["snippet"],
                }
                if request.GET.get("format"):
                    # NOTE(review): assumes the caller asked for format=raw,
                    # since the "raw" key is read unconditionally -- confirm.
                    msg_raw = str(mr.json()["raw"])
                    if request.GET.get("decode"):
                        msg = mime.from_string(base64.urlsafe_b64decode(msg_raw))
                        if msg.content_type.is_multipart():
                            for part in msg.parts:
                                # Filtering stops at the first text/plain part;
                                # text/html parts before it are filtered too.
                                if part.content_type == "text/plain":
                                    part.body = self.msg_filter(part.body)
                                    break
                                if part.content_type == "text/html":
                                    part.body = self.msg_filter(part.body)
                        else:
                            # NOTE(review): read as the single-part branch of
                            # the ``if`` above (not a for/else) -- confirm.
                            msg.body = self.msg_filter(msg.body)
                        ans_msg["debug"] = mime.python_message_to_string(msg.to_python_message())
                        ans_msg["raw"] = mime.python_message_to_string(msg.to_python_message())
                        # Re-encode the filtered message in the same URL-safe
                        # base64 form the raw input was decoded from.
                        ans_msg["raw"] = base64.urlsafe_b64encode(ans_msg["raw"])
                    else:
                        ans_msg["raw"] = msg_raw
                else:
                    # A payload without "parts" is wrapped in a one-item list.
                    if "parts" in mr.json()["payload"]:
                        self.parse_parts(ans_msg, mr.json()["payload"]["parts"])
                    else:
                        self.parse_parts(ans_msg, [mr.json()["payload"]])
                msgs.append(ans_msg)
        ans["messages"] = msgs
        return HttpResponse(json.dumps(ans), content_type="application/json")
    else:
        return HttpResponse(r.text, content_type="application/json", status=r.status_code)
def test_reply_headers_set(db, patch_smtp, api_client, example_draft, thread,
                           message):
    """A reply into a thread whose message has a Message-Id must set reply headers."""
    message.message_id_header = '<*****@*****.**>'
    db.session.commit()

    first_thread = api_client.get_data('/threads')[0]
    request = {'to': [{'email': '*****@*****.**'}],
               'thread_id': first_thread['id']}
    api_client.post_data('/send', request)

    _, msg = patch_smtp[-1]
    headers = mime.from_string(msg).headers
    for name in ('In-Reply-To', 'References'):
        assert name in headers
def replaceDummyReferences(message, reference_map):
    """Rewrite dummy-part filenames according to ``reference_map``.

    Returns ``(False, None)`` when ``message`` is not a MimePart, otherwise
    ``(True, reparsed_message)`` where the message has been serialized and
    parsed again after the filename substitutions.
    """
    if not isinstance(message, mime.message.part.MimePart):
        return False, None

    for part in message.walk(with_self=True):
        if part.content_type != DUMMY_CONTENT_TYPE:
            continue
        _, disposition_params = part.content_disposition
        old_name = disposition_params.get("filename")
        if old_name not in reference_map:
            continue
        part.content_disposition.params["filename"] = reference_map[old_name]
        # HACK: Must be set on the MIMEPart that has been modified
        part.was_changed = types.MethodType(was_changed_always, part)

    message.was_changed = types.MethodType(was_changed_always, message)
    reparsed = mime.from_string(message.to_string())
    return True, reparsed
def data(self):
    """Return this block's content, trying to recover it when it is missing.

    An empty block yields ``""``. Otherwise the in-memory ``_data`` is used if
    set, else the blockstore entry for ``data_sha256``. When nothing is found
    and this is a ``Block`` with parts, the raw message of the first part is
    fetched and re-parsed to find a MIME part whose SHA-256 matches; a match
    is saved back to the blockstore and returned. Returns None when the data
    cannot be found.
    """
    if self.size == 0:
        log.warning("Block size is 0")
        return ""
    elif hasattr(self, "_data"):
        # On initial download we temporarily store data in memory
        value = self._data
    else:
        value = get_from_blockstore(self.data_sha256)

    if value is None:
        log.warning("Couldn't find data on S3 for block with hash {}".format(self.data_sha256))

        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from inbox.models.block import Block

        if isinstance(self, Block):
            if self.parts:
                # This block is an attachment of a message that was
                # accidentally deleted. We will attempt to fetch the raw
                # message and parse out the needed attachment.
                message = self.parts[0].message  # only grab one
                raw_mime = get_from_blockstore(message.data_sha256)

                if raw_mime is None:
                    log.error("Don't have raw message for hash {}".format(message.data_sha256))
                    return None

                parsed = mime.from_string(raw_mime)
                if parsed is not None:
                    for mimepart in parsed.walk(with_self=parsed.content_type.is_singlepart()):
                        if mimepart.content_type.is_multipart():
                            continue  # TODO should we store relations?

                        data = mimepart.body

                        # Hash bytes, not text: encode unicode bodies first.
                        if isinstance(data, unicode):
                            data = data.encode("utf-8", "strict")

                        # Found it!
                        if sha256(data).hexdigest() == self.data_sha256:
                            log.info("Found subpart with hash {}".format(self.data_sha256))
                            save_to_blockstore(self.data_sha256, data)
                            return data
        # Recovery did not produce a match; value is still None here.
        log.error("No data returned!")
        return value

    assert self.data_sha256 == sha256(value).hexdigest(), "Returned data doesn't match stored hash!"
    return value
def toMime(self):
    """Build the MIME representation of this email.

    Raises ``EmailException`` unless the body and every attachment are
    loaded. Headers from ``other_headers`` are copied onto the generated
    message, which is then serialized and re-parsed before being returned.
    """
    fully_loaded = self.body.isLoaded() and all(
        [attachment.isLoaded() for attachment in self.attachments])
    if not fully_loaded:
        raise EmailException(u"EmailV2.toMime: All content must be loaded!")

    body = EmailHelpers.deserializeBody(self.body.content)

    if isinstance(self.server_attr, NOT_ASSIGNED):
        server_time = None
    else:
        server_time = self.server_attr.server_time

    raw_mime = createMime(
        body["text"], body["html"], self.attachments, self.message_id,
        server_time, self.subject, self.tos, self.ccs, self.bccs,
        self.reply_tos, self.sender, self.in_reply_to, self.references)

    for header_name, header_value in self.other_headers.iteritems():
        raw_mime.headers[header_name] = header_value

    return mime.from_string(raw_mime.to_string())
def setMIMEBcc(message, bccs):
    """Set or remove the Bcc header of ``message``.

    ``bccs`` must be a list of dicts with unicode ``user_id`` and
    ``display_name`` values. An empty list removes the header. Returns
    ``(False, None)`` on invalid input, otherwise ``(True, reparsed_message)``.
    """
    if not isinstance(message, mime.message.part.MimePart):
        return False, None
    if not isinstance(bccs, list):
        return False, None

    for entry in bccs:
        if not isinstance(entry, dict):
            return False, None
        has_unicode_fields = (isinstance(entry.get("user_id"), unicode) and
                              isinstance(entry.get("display_name"), unicode))
        if not has_unicode_fields:
            return False, None

    if len(bccs) != 0:
        formatted = [u"{} <{}>".format(entry["display_name"], entry["user_id"])
                     for entry in bccs]
        message.headers["Bcc"] = u"{}".format(", ".join(formatted))
    else:
        message.remove_headers("Bcc")

    return True, mime.from_string(message.to_string())
def test_bcc_in_recipients_but_stripped_from_headers(patch_smtp, api_client):
    """Bcc addresses are delivered to but must not appear in the headers."""
    request = {
        'to': [{'email': '*****@*****.**'}],
        'cc': [{'email': '*****@*****.**'}],
        'bcc': [{'email': '*****@*****.**'}],
        'subject': 'Banalities'
    }
    r = api_client.post_data('/send', request)
    assert r.status_code == 200

    recipients, msg = patch_smtp[0]
    assert set(recipients) == {'*****@*****.**', '*****@*****.**', '*****@*****.**'}

    headers = mime.from_string(msg).headers
    assert 'Bcc' not in headers
    assert headers.get('To') == '*****@*****.**'
    assert headers.get('Cc') == '*****@*****.**'
def test_inline_image_send(patch_smtp, api_client, uploaded_file_ids):
    """A file referenced by cid in the body must be sent as an inline image."""
    file_id = uploaded_file_ids[0]
    request = {
        'subject': 'Inline image test',
        'body': 'Before image\r\n[cid:{}]\r\nAfter image'.format(file_id),
        'file_ids': [file_id],
        'to': [{'name': 'Foo Bar', 'email': '*****@*****.**'}]
    }
    r = api_client.post_data('/send', request)
    assert r.status_code == 200

    _, msg = patch_smtp[-1]
    for mimepart in mime.from_string(msg).walk():
        part_headers = mimepart.headers
        if part_headers['Content-Type'] != 'image/jpeg':
            continue
        assert part_headers['Content-Id'] == '<{}>'.format(file_id)
        assert part_headers['Content-Disposition'][0] == 'inline'
def test_body_construction(patch_smtp, api_client):
    """An HTML body must be sent as a plain-text part plus an HTML part."""
    request = {'to': [{'email': '*****@*****.**'}],
               'subject': 'Banalities',
               'body': '<html>Hello there</html>'}
    api_client.post_data('/send', request)

    _, msg = patch_smtp[-1]
    parsed = mime.from_string(msg)
    assert len(parsed.parts) == 2

    expected_bodies = {
        'text/plain': 'Hello there',
        'text/html': '<html>Hello there</html>',
    }
    seen = set()
    for part in parsed.parts:
        kind = part.content_type.value
        if kind in expected_bodies:
            seen.add(kind)
            assert part.body.strip() == expected_bodies[kind]

    # Both the plain and the HTML part must have been present.
    assert seen == set(expected_bodies)
def test_send_with_event_and_attachments(patch_smtp, api_client, example_draft,
                                         event, attachments):
    """Sending a draft with a file and an event must include both in the MIME.

    Uploads the first fixture attachment, attaches it to a new draft, sends
    the draft with an event ID and checks the parts of the one sent message.
    """
    msgs = patch_smtp

    # Load and post file for attachment
    filename, path = attachments[0]
    # The context manager closes the upload handle, which was previously
    # left open.
    with open(path, 'rb') as upload:
        data = {'file': (upload, filename)}
        r = api_client.post_raw('/files', data=data)
    assert r.status_code == 200
    attachment_id = json.loads(r.data)[0]['id']

    # Add attachment to the new draft and post the draft
    example_draft['file_ids'] = [attachment_id]
    r = api_client.post_data('/drafts', example_draft)
    assert r.status_code == 200
    returned_draft = json.loads(r.data)
    draft_public_id = returned_draft['id']
    version = returned_draft['version']

    # Send the draft along with an event ID to use for invites
    r = api_client.post_data('/send', {'draft_id': draft_public_id,
                                       'version': version,
                                       'event_id': event.public_id})
    assert r.status_code == 200

    # Make sure one message was sent
    assert len(msgs) == 1
    recipients, raw_msg = msgs[0]
    msg = mime.from_string(raw_msg)

    # Check the MIME body of the message to make sure both the event and the
    # attachment are there
    parts = []
    for mimepart in msg.walk(with_self=msg.content_type.is_singlepart()):
        is_attachment = mimepart.is_attachment()
        format_type = mimepart.content_type.format_type
        subtype = mimepart.content_type.subtype
        parts.append((format_type, subtype, is_attachment))

    assert ('text', 'plain', False) in parts
    assert ('text', 'html', False) in parts
    assert ('text', 'calendar', False) in parts
    assert ('image', 'jpeg', True) in parts
def test_body_construction(patch_smtp, api_client):
    """An HTML body must be sent as a plain-text part plus an HTML part."""
    request = {
        "to": [{"email": "*****@*****.**"}],
        "subject": "Banalities",
        "body": "<html>Hello there</html>",
    }
    api_client.post_data("/send", request)

    _, msg = patch_smtp[-1]
    parsed = mime.from_string(msg)
    assert len(parsed.parts) == 2

    expected_bodies = {
        "text/plain": "Hello there",
        "text/html": "<html>Hello there</html>",
    }
    found = {kind: False for kind in expected_bodies}
    for part in parsed.parts:
        kind = part.content_type.value
        if kind not in expected_bodies:
            continue
        found[kind] = True
        assert part.body.strip() == expected_bodies[kind]

    assert found["text/plain"] and found["text/html"]
def process_message(source, message_text):
    """Parse an e-mail, store its parts and apply collected metadata.

    The From and Subject headers are recorded under the source's configured
    metadata types (when set), merged with the metadata returned by
    ``EmailBaseModel._process_message``, and applied to every document that
    call created.
    """
    from flanker import mime

    message = mime.from_string(force_bytes(message_text))

    metadata_dictionary = {}
    header_sources = (
        (source.from_metadata_type, 'From'),
        (source.subject_metadata_type, 'Subject'),
    )
    for metadata_type, header_name in header_sources:
        if metadata_type:
            metadata_dictionary[metadata_type.name] = message.headers.get(header_name)

    document_ids, parts_metadata_dictionary = EmailBaseModel._process_message(
        source=source, message=message)
    metadata_dictionary.update(parts_metadata_dictionary)

    if not metadata_dictionary:
        return

    for document in Document.objects.filter(id__in=document_ids):
        set_bulk_metadata(document=document,
                          metadata_dictionary=metadata_dictionary)
def test_address_parsing():
    """Check that header parsing can handle a variety of tricky input."""
    # Each case: (header name to parse, raw message text, expected result).
    cases = [
        # Extra quotes around display name
        ("From", 'From: ""Bob"" <*****@*****.**>',
         [[" Bob ", "*****@*****.**"]]),
        # Comments after addr-spec
        ("From",
         'From: "Bob" <*****@*****.**>(through Yahoo! Store Order System)',
         [["Bob", "*****@*****.**"]]),
        ("From", "From: Indiegogo <*****@*****.**> (no reply)",
         [["Indiegogo", "*****@*****.**"]]),
        ("From", "From: Anon <*****@*****.**> (GitHub Staff)",
         [["Anon", "*****@*****.**"]]),
        # Display name in comment
        ("From", "From: root@gunks (Cron Daemon)",
         [["Cron Daemon", "root@gunks"]]),
        # Missing closing angle bracket
        ("From", "From: Bob <*****@*****.**",
         [["Bob", "*****@*****.**"]]),
        # Blank (spammers)
        ("From", "From: ()", []),
        # Missing header
        ("From", "", []),
        # Duplicate header
        ("From",
         "From: [email protected]\r\n"
         "From: [email protected]",
         [["", "*****@*****.**"]]),
        # RFC2047-encoded phrases with commas
        ("From", "From: =?utf-8?Q?Foo=2C=20Corp.?= <*****@*****.**>",
         [["Foo, Corp.", "*****@*****.**"]]),
        # This literal was split by a raw line break after "To: ", which left
        # it unterminated; it is rejoined here.
        # NOTE(review): confirm nothing else was lost at that break.
        ("To",
         "To: =?utf-8?Q?Foo=2C=20Corp.?= <*****@*****.**>, "
         "=?utf-8?Q?Support?= <*****@*****.**>",
         [
             ["Foo, Corp.", "*****@*****.**"],
             ["Support", "*****@*****.**"],
         ]),
        # Multiple header lines
        ("To", "To: [email protected]\nSubject: Hello\nTo: [email protected]",
         [["", "*****@*****.**"], ["", "*****@*****.**"]]),
    ]

    for header_name, raw, expected in cases:
        mimepart = mime.from_string(raw)
        parsed = parse_mimepart_address_header(mimepart, header_name)
        assert parsed == expected
def create_from_synced(cls, account, mid, folder_name, received_date,
                       body_string):
    """
    Build an uncommitted Message from a raw, still-encoded MIME string.

    The raw bytes are hashed and passed to ``save_to_blockstore``, then
    parsed with ``mime.from_string``. Message-level metadata and every
    non-multipart MIME part are handled by the message's ``_parse_metadata``
    and ``_parse_mimepart`` methods. Parsing failures are logged and flagged
    with ``_mark_error`` instead of being raised. Threads are not computed
    here; you gotta do that separately.

    Parameters
    ----------
    account
        Owning account; ``account.namespace`` must not be None.
    mid : int
        The account backend-specific message identifier; it's only used for
        logging errors.
    folder_name
        Name of the folder the message came from (used for logging and
        passed to ``_parse_metadata``).
    received_date
        Passed through to ``_parse_metadata``; not validated here.
    body_string : str
        The full message including headers (encoded, never ``unicode``).

    Returns
    -------
    The new Message object.

    Raises
    ------
    ValueError
        If account, mid, folder_name or body_string is None.
    """
    if any(arg is None
           for arg in [account, mid, folder_name, body_string]):
        raise ValueError(
            'Required keyword arguments: account, mid, folder_name, '
            'body_string')

    # stop trickle-down bugs
    assert account.namespace is not None
    assert not isinstance(body_string, unicode)

    msg = Message()
    msg.data_sha256 = sha256(body_string).hexdigest()

    # Persist the raw MIME message to disk/ S3
    save_to_blockstore(msg.data_sha256, body_string)

    # Persist the processed message to the database
    msg.namespace_id = account.namespace.id

    try:
        parsed = mime.from_string(body_string)
        # Non-persisted instance attribute used by EAS.
        msg.parsed_body = parsed
        msg._parse_metadata(parsed, body_string, received_date, account.id,
                            folder_name, mid)
    except (mime.DecodingError, AttributeError, RuntimeError,
            TypeError) as e:
        parsed = None
        # Non-persisted instance attribute used by EAS.
        msg.parsed_body = ''
        log.error('Error parsing message metadata',
                  folder_name=folder_name, account_id=account.id, error=e)
        msg._mark_error()

    # Nothing more can be extracted from a message that failed to parse.
    if parsed is None:
        return msg

    plain_parts = []
    html_parts = []
    include_root = parsed.content_type.is_singlepart()
    for mimepart in parsed.walk(with_self=include_root):
        try:
            if not mimepart.content_type.is_multipart():
                msg._parse_mimepart(mid, mimepart, account.namespace.id,
                                    html_parts, plain_parts)
            # else: multipart container. TODO should we store relations?
        except (mime.DecodingError, AttributeError, RuntimeError,
                TypeError, binascii.Error, UnicodeDecodeError) as e:
            log.error('Error parsing message MIME parts',
                      folder_name=folder_name, account_id=account.id,
                      error=e)
            msg._mark_error()
    msg.calculate_body(html_parts, plain_parts)

    # Occasionally people try to send messages to way too many
    # recipients. In such cases, empty the field and treat as a parsing
    # error so that we don't break the entire sync.
    for field in ('to_addr', 'cc_addr', 'bcc_addr', 'references',
                  'reply_to'):
        if not json_field_too_long(getattr(msg, field)):
            continue
        log.error('Recipient field too long', field=field,
                  account_id=account.id, folder_name=folder_name, mid=mid)
        setattr(msg, field, [])
        msg._mark_error()

    return msg
def process_message(source, message_text, message_properties=None):
    """Turn one email (or one MIME part of it) into uploaded documents.

    The text is parsed with flanker. A message that has parts is processed
    recursively, one part at a time. A leaf that is an attachment or inline
    part is uploaded through ``source.handle_upload`` unless its name equals
    ``source.metadata_attachment_name``, in which case it is read as YAML.
    Any other leaf is treated as the body and is uploaded only when
    ``source.store_body`` is set.

    :param source: object supplying the metadata types, document type and
        ``handle_upload``.
    :param message_text: raw message text; coerced with ``force_bytes``.
    :param message_properties: optional dict carrying 'Subject' and 'From'
        down the recursion; missing entries are filled from the headers.
    """
    from flanker import mime

    counter = 1
    message = mime.from_string(force_bytes(message_text))
    metadata_dictionary = {}

    if not message_properties:
        message_properties = {}

    # Values handed down from the parent message win over this part's headers.
    for header in ('Subject', 'From'):
        message_properties[header] = message_properties.get(
            header, message.headers.get(header))

    if source.subject_metadata_type:
        metadata_dictionary[
            source.subject_metadata_type.name
        ] = message_properties.get('Subject')

    if source.from_metadata_type:
        metadata_dictionary[
            source.from_metadata_type.name
        ] = message_properties.get('From')

    def attach_metadata(documents):
        # Stamp the collected metadata on every uploaded document.
        if metadata_dictionary:
            for document in documents:
                set_bulk_metadata(
                    document=document,
                    metadata_dictionary=metadata_dictionary)

    # Messages are tree based, do nested processing of message parts until
    # a message with no children is found, then work out way up.
    if message.parts:
        for part in message.parts:
            EmailBaseModel.process_message(
                source=source, message_text=part.to_string(),
                message_properties=message_properties)
        return

    # Treat inlines as attachments, both are extracted and saved as
    # documents
    if message.is_attachment() or message.is_inline():
        # Reject zero length attachments
        if len(message.body) == 0:
            return

        label = message.detected_file_name or 'attachment-{}'.format(
            counter)
        with ContentFile(content=message.body, name=label) as file_object:
            if label == source.metadata_attachment_name:
                # NOTE(review): this assignment rebinds a local of the
                # current call only; confirm how sibling parts are meant
                # to see the loaded metadata.
                metadata_dictionary = yaml.load(
                    stream=file_object.read(), Loader=SafeLoader)
                logger.debug('Got metadata dictionary: %s',
                             metadata_dictionary)
            else:
                attach_metadata(source.handle_upload(
                    document_type=source.document_type,
                    file_object=file_object,
                    expand=(source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y)))
        return

    # If it is not an attachment then it should be a body message part.
    # Another option is to use message.is_body()
    label = (
        'email_body.html'
        if message.detected_content_type == 'text/html'
        else 'email_body.txt'
    )

    if source.store_body:
        with ContentFile(content=force_bytes(message.body),
                         name=label) as file_object:
            # NOTE(review): the attachment branch passes a boolean for
            # `expand`; here the choice constant itself is passed - confirm
            # that is what handle_upload expects.
            attach_metadata(source.handle_upload(
                document_type=source.document_type,
                expand=SOURCE_UNCOMPRESS_CHOICE_N,
                file_object=file_object))
def test_address_parsing():
    """Check that header parsing can handle a variety of tricky input."""

    def parse(raw, header='From'):
        # Parse the raw message text, then extract the named address header.
        mimepart = mime.from_string(raw)
        return parse_mimepart_address_header(mimepart, header)

    # Extra quotes around display name
    assert parse('From: ""Bob"" <*****@*****.**>') == [
        [' Bob ', '*****@*****.**']]

    # Comments after addr-spec
    assert parse(
        'From: "Bob" <*****@*****.**>(through Yahoo! Store Order System)'
    ) == [['Bob', '*****@*****.**']]
    assert parse('From: Indiegogo <*****@*****.**> (no reply)') == [
        ['Indiegogo', '*****@*****.**']]
    assert parse('From: Anon <*****@*****.**> (GitHub Staff)') == [
        ['Anon', '*****@*****.**']]

    # Display name in comment
    assert parse('From: root@gunks (Cron Daemon)') == [
        ['Cron Daemon', 'root@gunks']]

    # Missing closing angle bracket
    assert parse('From: Bob <*****@*****.**') == [
        ['Bob', '*****@*****.**']]

    # Blank (spammers)
    assert parse('From: ()') == []

    # Missing header
    assert parse('') == []

    # Duplicate header
    assert parse('From: [email protected]\r\n'
                 'From: [email protected]') == [['', '*****@*****.**']]

    # RFC2047-encoded phrases with commas
    assert parse('From: =?utf-8?Q?Foo=2C=20Corp.?= <*****@*****.**>') == [
        ['Foo, Corp.', '*****@*****.**']]
    assert parse(
        'To: =?utf-8?Q?Foo=2C=20Corp.?= <*****@*****.**>, '
        '=?utf-8?Q?Support?= <*****@*****.**>',
        'To',
    ) == [['Foo, Corp.', '*****@*****.**'],
          ['Support', '*****@*****.**']]

    # Multiple header lines
    assert parse(
        'To: [email protected]\nSubject: Hello\nTo: [email protected]',
        'To',
    ) == [['', '*****@*****.**'], ['', '*****@*****.**']]
def test_draft_updates(db, default_account, mock_imapclient):
    """Save, update and delete a draft through the mocked IMAP client,
    checking the contents of the Drafts folder after every step."""

    def drafts_uids(conn):
        # Every check re-selects Drafts before listing its uids.
        conn.select_folder("Drafts", lambda *args: True)
        return conn.all_uids()

    # Set up folder list
    for folder in ("Drafts", "Trash", "Sent Mail"):
        mock_imapclient._data[folder] = {}
    mock_imapclient.list_folders = lambda: [
        (("\\HasNoChildren", "\\Drafts"), "/", "Drafts"),
        (("\\HasNoChildren", "\\Trash"), "/", "Trash"),
        (("\\HasNoChildren", "\\Sent"), "/", "Sent Mail"),
    ]

    pool = writable_connection_pool(default_account.id)

    draft = create_message_from_json(
        {"subject": "Test draft"}, default_account.namespace, db.session,
        True)
    draft.is_draft = True
    draft.version = 0
    db.session.commit()

    with pool.get() as conn:
        save_draft(conn, default_account.id, draft.id, {"version": 0})
        assert len(drafts_uids(conn)) == 1

        # Check that draft is not resaved if already synced.
        update_draft(conn, default_account.id, draft.id, {"version": 0})
        assert len(drafts_uids(conn)) == 1

        # Check that an older version is deleted
        draft.version = 4
        sendmail_update_draft(
            db.session,
            default_account,
            draft,
            from_addr=draft.from_addr,
            subject="New subject",
            blocks=[],
        )
        db.session.commit()

        update_draft(conn, default_account.id, draft.id, {"version": 5})
        all_uids = drafts_uids(conn)
        assert len(all_uids) == 1

        data = conn.uids(all_uids)[0]
        parsed = mime.from_string(data.body)
        expected_message_id = "<{}-{}@mailer.nylas.com>".format(
            draft.public_id, draft.version)
        assert parsed.headers.get("Message-Id") == expected_message_id

        # We're testing the draft deletion with Gmail here. However,
        # because of a race condition in Gmail's reconciliation algorithm,
        # we need to check if the sent mail has been created in the sent
        # folder. Since we're mocking everything, we have to create it
        # ourselves.
        mock_imapclient.append("Sent Mail", data.body, None, None,
                               x_gm_msgid=4323)

        deletion_args = {
            "message_id_header": draft.message_id_header,
            "nylas_uid": draft.nylas_uid,
            "version": 5,
        }
        delete_draft(conn, default_account.id, draft.id, deletion_args)
        all_uids = drafts_uids(conn)
        assert len(all_uids) == 0
import mailbox
from collections import defaultdict

from flanker import mime
from flanker.addresslib import address

# Count how many messages in an mbox file come from each sender domain.
mbox_path = '...'
mbox = mailbox.mbox(mbox_path)

# Maps sender hostname -> number of messages seen from it.
domains = defaultdict(int)

items = mbox.iteritems()
for msg in items:
    # Each item is a (key, message) pair; re-serialise the message so that
    # flanker can parse it.
    raw_msg = msg[1].as_string()
    parsed = mime.from_string(raw_msg)
    from_header = parsed.headers['From']

    # address.parse() yields None instead of raising when it cannot parse
    # its input, and the header itself may be absent; skip such messages
    # rather than abort the whole scan on `.hostname`.
    from_address = address.parse(from_header) if from_header else None
    if from_address is None:
        continue

    from_domain = from_address.hostname
    domains[from_domain] += 1
def test_draft_updates(db, default_account, mock_imapclient):
    """Walk a draft through save, no-op update, versioned update and delete
    on the mocked IMAP client, asserting the Drafts uid count each time."""

    def _accept(*args):
        return True

    # Set up folder list
    mock_imapclient._data.update(
        [('Drafts', {}), ('Trash', {}), ('Sent Mail', {})])
    folder_listing = [
        (('\\HasNoChildren', '\\Drafts'), '/', 'Drafts'),
        (('\\HasNoChildren', '\\Trash'), '/', 'Trash'),
        (('\\HasNoChildren', '\\Sent'), '/', 'Sent Mail'),
    ]
    mock_imapclient.list_folders = lambda: list(folder_listing)

    pool = writable_connection_pool(default_account.id)

    draft = create_message_from_json({'subject': 'Test draft'},
                                     default_account.namespace, db.session,
                                     True)
    draft.is_draft = True
    draft.version = 0
    db.session.commit()

    with pool.get() as conn:
        save_draft(conn, default_account.id, draft.id, {'version': 0})
        conn.select_folder('Drafts', _accept)
        uid_count = len(conn.all_uids())
        assert uid_count == 1

        # Check that draft is not resaved if already synced.
        update_draft(conn, default_account.id, draft.id, {'version': 0})
        conn.select_folder('Drafts', _accept)
        uid_count = len(conn.all_uids())
        assert uid_count == 1

        # Check that an older version is deleted
        draft.version = 4
        sendmail_update_draft(db.session, default_account, draft,
                              from_addr=draft.from_addr,
                              subject='New subject', blocks=[])
        db.session.commit()

        update_draft(conn, default_account.id, draft.id, {'version': 5})
        conn.select_folder('Drafts', _accept)
        all_uids = conn.all_uids()
        assert len(all_uids) == 1

        data = conn.uids(all_uids)[0]
        parsed = mime.from_string(data.body)
        actual_message_id = parsed.headers.get('Message-Id')
        expected_message_id = '<{}-{}@mailer.nylas.com>'.format(
            draft.public_id, draft.version)
        assert actual_message_id == expected_message_id

        # We're testing the draft deletion with Gmail here. However,
        # because of a race condition in Gmail's reconciliation algorithm,
        # we need to check if the sent mail has been created in the sent
        # folder. Since we're mocking everything, we have to create it
        # ourselves.
        mock_imapclient.append('Sent Mail', data.body, None, None,
                               x_gm_msgid=4323)

        delete_draft(
            conn, default_account.id, draft.id,
            {
                'message_id_header': draft.message_id_header,
                'nylas_uid': draft.nylas_uid,
                'version': 5
            })
        conn.select_folder('Drafts', _accept)
        all_uids = conn.all_uids()
        assert len(all_uids) == 0
def data(self):
    """Return this block's raw data, recovering it from the message if lost.

    Lookup order:

    1. A size-0 block yields ``''`` immediately.
    2. ``self._data`` when present (data held in memory on initial
       download), otherwise the blockstore entry for ``self.data_sha256``.
    3. If neither has it and this object is a ``Block`` with parts, the raw
       MIME of the first part's message is loaded from the blockstore, or
       failing that from the provider (and then cached in the blockstore).
       The message is re-parsed and the sub-part whose SHA-256 equals
       ``self.data_sha256`` is saved to the blockstore and returned.

    Returns
    -------
    The data, ``''`` for an empty block, or None when it cannot be found.

    Raises
    ------
    AssertionError
        If data found in step 2 does not hash to ``self.data_sha256``.
    """
    if self.size == 0:
        log.warning('Block size is 0')
        return ''
    elif hasattr(self, '_data'):
        # On initial download we temporarily store data in memory
        value = self._data
    else:
        value = blockstore.get_from_blockstore(self.data_sha256)

    if value is None:
        log.warning("Couldn't find data on S3 for block",
                    sha_hash=self.data_sha256)

        from inbox.models.block import Block
        if isinstance(self, Block):
            if self.parts:
                # This block is an attachment of a message that was
                # deleted. We will attempt to fetch the raw
                # message and parse out the needed attachment.

                message = self.parts[0].message  # only grab one
                account = message.namespace.account

                statsd_string = 'api.direct_fetching.{}.{}'.format(
                    account.provider, account.id)

                # Try to fetch the message from S3 first.
                with statsd_client.timer(
                        '{}.blockstore_latency'.format(statsd_string)):
                    raw_mime = blockstore.get_from_blockstore(
                        message.data_sha256)

                # If it's not there, get it from the provider.
                if raw_mime is None:
                    statsd_client.incr(
                        '{}.cache_misses'.format(statsd_string))

                    with statsd_client.timer(
                            '{}.provider_latency'.format(statsd_string)):
                        raw_mime = get_raw_from_provider(message)

                    # Only hash and cache when the provider actually gave
                    # us something: hashing None would raise TypeError and
                    # skip the "give up" handling below.
                    if raw_mime is not None:
                        msg_sha256 = sha256(raw_mime).hexdigest()

                        # Cache the raw message in the blockstore so that
                        # we don't have to fetch it over and over.
                        with statsd_client.timer(
                                '{}.blockstore_save_latency'.format(
                                    statsd_string)):
                            blockstore.save_to_blockstore(msg_sha256,
                                                          raw_mime)
                else:
                    # We found it in the blockstore --- report this.
                    statsd_client.incr(
                        '{}.cache_hits'.format(statsd_string))

                # If we couldn't find it there, give up.
                if raw_mime is None:
                    log.error("Don't have raw message for hash {}".format(
                        message.data_sha256))
                    return None

                parsed = mime.from_string(raw_mime)
                if parsed is not None:
                    for mimepart in parsed.walk(
                            with_self=parsed.content_type.is_singlepart()):
                        if mimepart.content_type.is_multipart():
                            continue  # TODO should we store relations?

                        data = mimepart.body

                        # Hashes are computed over bytes, so encode text.
                        if isinstance(data, unicode):
                            data = data.encode('utf-8', 'strict')

                        if data is None:
                            continue

                        # Found it!
                        if sha256(data).hexdigest() == self.data_sha256:
                            log.info('Found subpart with hash {}'.format(
                                self.data_sha256))

                            with statsd_client.timer(
                                    '{}.blockstore_save_latency'.format(
                                        statsd_string)):
                                blockstore.save_to_blockstore(
                                    self.data_sha256, data)
                                return data

                log.error(
                    "Couldn't find the attachment in the raw message",
                    message_id=message.id)

        log.error('No data returned!')
        return value

    assert self.data_sha256 == sha256(value).hexdigest(), \
        "Returned data doesn't match stored hash!"
    return value
def __init__(self, message_string):
    """Keep the raw message text and its flanker-parsed form.

    :param message_string: raw message text, stored as
        ``self.message_string`` and parsed into ``self.FKmsg`` with
        ``mime.from_string``.
    """
    self.message_string = message_string
    # Parse from the stored attribute, exactly as it was just assigned.
    parsed_message = mime.from_string(self.message_string)
    self.FKmsg = parsed_message
def dosync(self):
    """Fetch mail that arrived since the last poll and dispatch it.

    Searches the mailbox (reconnecting and retrying if the search fails),
    works out which ids are newer than ``self.last_id``, fetches each one
    and hands it to ``callbacks.triggered_email`` when a trigger is found
    in it and the sender is whitelisted, or to ``callbacks.raw_email``
    otherwise. ``self.last_id`` is advanced to the newest id seen.

    Compared with a naive scan this guards three edge cases: an empty
    mailbox no longer raises IndexError, ids are compared as numbers rather
    than as strings (so "10" is newer than "9"), and the oldest id in the
    listing is no longer skipped.
    """
    print("You've Got Mail.")

    # Keep retrying the search until it succeeds, reconnecting after each
    # failure.
    while True:
        try:
            _, data = self.mail.search(None, "ALL")
            break
        except Exception:
            # Attempt reconnect
            print("Disconnected, attempting reconnect.")
            self.mail = imaplib2.IMAP4_SSL(IMAP_SERVER)
            self.mail.login(MAIL_USER, MAIL_PASSWORD)
            self.mail.select("inbox", readonly=True)

    ids = data[0] if data else None
    id_list = ids.split() if ids else []
    if not id_list:
        # Empty mailbox: nothing to fetch and no newest id to remember.
        return

    def _as_number(value):
        # Ids arrive as decimal strings; anything else sorts first.
        try:
            return int(value)
        except (TypeError, ValueError):
            return -1

    newest_id = id_list[-1]
    new_mail_ids = []
    if _as_number(newest_id) >= _as_number(self.last_id):
        # Walk backwards from the newest id until the last one handled.
        for candidate in reversed(id_list):
            if candidate == self.last_id:
                break
            new_mail_ids.append(candidate)
    self.last_id = newest_id

    for mail_id in new_mail_ids:
        _, data = self.mail.fetch(mail_id, "(RFC822)")
        raw_email = "null"
        for d in data:
            if isinstance(d, tuple) and "RFC822" in d[0]:
                raw_email = d[1]
        if raw_email == "null":
            continue

        email_message = email.message_from_string(raw_email)
        flanker_msg = mime.from_string(raw_email)

        body = "null"
        for part in flanker_msg.parts:
            # Best effort: a part whose body cannot be read or encoded is
            # skipped instead of ending the search through the other parts.
            try:
                pp = part.body.encode('ascii', 'ignore')
                found = start_trigger(pp, TRIGGERS)
            except Exception:
                continue
            if found:
                body = pp
                break

        # If body is still null, just look for this stuff
        if body == "null":
            for l in raw_email.split('\n'):
                if start_trigger(l, TRIGGERS):
                    body = l

        # CR-LF ugh
        body = body.replace('\r', '')

        tos = email_message.get_all('to', [])
        ccs = email_message.get_all('cc', [])
        all_recipients = getaddresses(tos + ccs)
        reply_address = email_message["Reply-To"] or email_message["From"]
        if reply_address:
            all_recipients = all_recipients + [parseaddr(reply_address)]

        reply_object = {
            'subject': email_message["Subject"],
            'all_recipients': all_recipients,
            'raw_email': raw_email,
            'msg_id': email_message["Message-ID"]
        }
        if "In-Reply-To" in email_message:
            reply_object["reply_to"] = email_message["In-Reply-To"]

        trigger = start_trigger(body, TRIGGERS)
        if trigger and "From" in email_message and is_whitelisted(
                raw_email):
            print("Request from {} for subject {}.".format(
                email_message["From"], email_message["Subject"]))
            # Extra parsing since our trigger word can include spaces due
            # to gmail autocomplete
            body = body.replace(trigger, '')
            argv = [trigger] + [x.strip() for x in body.split()]
            callbacks.triggered_email(body, argv, reply_object)
        else:
            callbacks.raw_email(flanker_msg, raw_email, reply_object)
def eml_to_list(my_eml):
    """Extract resume records from the HTML part of a saved email.

    The .eml file is parsed with flanker. For each ``text/html`` part the
    rows of the table nested inside the first table are flattened to text,
    the text is cut into one chunk per resume at "最後修改", each chunk is
    trimmed at "專長", and the remaining comma-separated fields are turned
    into a dict.

    :param my_eml: path of the .eml file to read.
    :return: list of dicts, one per resume, keyed by the field labels;
        empty when the message has no text/html part. If several text/html
        parts exist, the records of the last one are returned.
    """
    with open(my_eml, 'rb') as fhdl:
        raw_email = fhdl.read()
    msg = mime.from_string(raw_email)

    # (label found in the record, key stored in the result) for fields whose
    # value is simply the text that follows the label.
    labelled_fields = (
        ("聯絡電話", "聯絡電話"),
        ("電子郵件", "電子郵件"),
        ("聯絡地址", "聯絡地址"),
        ("教育程度", "教育程度"),
        ("職務類別", "求職類別"),
        ("工作經驗累計年資", "累計年資"),
    )

    # Defined up front so that a message without an HTML part returns []
    # instead of failing on an unbound name.
    coll = []
    for part in msg.parts:
        if not (part.content_type == "text/html"):
            continue

        soup = BeautifulSoup(part.body, "html.parser")
        rows = soup.table.table.find_all("tr")
        print(type(rows))

        row_lines = []
        for row in rows:
            text = (row.text.replace(" ", "").replace('\u3000', "")
                    .replace('▍', ""))
            row_lines.append(text.splitlines())
        # Drop rows with no text, then the trailing row.
        row_lines = [lines for lines in row_lines if lines][:-1]

        # First line of every row, minus two leading rows and one trailing
        # row.
        first_cells = [lines[0] for lines in row_lines][2:-1]
        result = ','.join(first_cells)

        # Build a new list rather than removing from the list being
        # iterated, which would skip the element after each removal.
        chunks = [s for s in result.split("最後修改") if len(s) >= 100]
        resumes = [chunk.split("專長")[0] for chunk in chunks]

        # The first two comma-separated fields of each resume are discarded.
        split_resumes = [fields for fields in
                         (resume.split(",")[2:] for resume in resumes)
                         if fields]

        coll = []
        for sub_list in split_resumes:
            d = {}
            if len(sub_list[-1]) <= 5:
                sub_list.pop()
            for count, record in enumerate(sub_list):
                if "代碼" in record:
                    d["姓名"] = record.split("代碼")[0]
                # Gender and age share one "gender|age" record; require the
                # separator so that a stray 男/女 elsewhere cannot raise.
                if (("男" in record) or ("女" in record)) and "|" in record:
                    gender, age = record.split("|")[:2]
                    d["性別"] = gender
                    d["年齡"] = age
                for label, key in labelled_fields:
                    if label in record:
                        d[key] = record.split(label)[1]
                if "累計經驗" in record:
                    d["累計經驗"] = record.split("累計經驗")[1]
                    # Everything after this record is stored as past
                    # employers.
                    d["過往公司"] = ",".join(sub_list[count + 1:])
            coll.append(d)
    return coll
emails_path = os.getenv('EMAILS_PATH', "/var/mailgunflanker") elasticsearch_url = os.getenv('ELASTICSEARCH', "http://elasticsearch:9200") email_index = os.getenv('ELASTICSEARCH_INDEX', "mailgunflanker") from elasticsearch import Elasticsearch es = Elasticsearch([elasticsearch_url]) es.indices.create(index=email_index, ignore=400) id = 0 for root, dirs, files in os.walk(emails_path): for file in files: id = id + 1 filefullname = os.path.join(root, file) message_string = open(filefullname, "rb").read() msg = mime.from_string(message_string) # add email to elasticsearch doc = {} doc['subject'] = msg.clean_subject doc['headers'] = [[header, str(value)] for header, value in msg.headers.items()] if (msg.content_type.is_multipart()): doc["parts"] = [] doc["attachments"] = [] for part in msg.parts: if (part.content_type and str(part.content_type).startswith("text/")): doc["parts"].append({ "body": part.body,
def create_message(db_session, log, account, mid, folder_name, received_date,
                   flags, body_string, created):
    """ Parses message data and writes out db metadata and MIME blocks.

    Returns the new Message, which links to the new Block objects through
    relationships. All new objects are uncommitted.

    Threads are not computed here; you gotta do that separately.

    Parameters
    ----------
    db_session, flags
        Accepted for the caller's benefit; not used in this function.
    log
        Logger used for warnings and errors.
    account
        Owning account; it and its ``namespace`` must not be None.
    mid : int
        The account backend-specific message identifier; it's only used for
        logging errors.
    folder_name
        Folder the message came from; only used for logging errors.
    received_date
        Stored on the message as-is.
    body_string : str
        The full message including headers (encoded, never ``unicode``).
    created
        When true a ``SpoolMessage`` is built instead of a ``Message``.

    Returns
    -------
    The new message, or None when parsing raised ``mime.DecodingError`` or
    ``RuntimeError`` (the raw message is then written out through
    ``log_decode_error``).
    """
    # trickle-down bugs
    assert account is not None and account.namespace is not None
    assert not isinstance(body_string, unicode)

    try:
        parsed = mime.from_string(body_string)

        mime_version = parsed.headers.get('Mime-Version')
        # NOTE: sometimes MIME-Version is set to "1.0 (1.0)", hence the
        # .startswith
        if mime_version is not None and not mime_version.startswith('1.0'):
            log.error('Unexpected MIME-Version: {0}'.format(mime_version))

        new_msg = SpoolMessage() if created else Message()
        new_msg.data_sha256 = sha256(body_string).hexdigest()

        # clean_subject strips re:, fwd: etc.
        new_msg.subject = parsed.clean_subject
        new_msg.from_addr = parse_email_address_list(
            parsed.headers.get('From'))
        new_msg.sender_addr = parse_email_address_list(
            parsed.headers.get('Sender'))
        new_msg.reply_to = parse_email_address_list(
            parsed.headers.get('Reply-To'))

        new_msg.to_addr = parse_email_address_list(
            parsed.headers.getall('To'))
        new_msg.cc_addr = parse_email_address_list(
            parsed.headers.getall('Cc'))
        new_msg.bcc_addr = parse_email_address_list(
            parsed.headers.getall('Bcc'))

        new_msg.in_reply_to = parsed.headers.get('In-Reply-To')
        new_msg.message_id_header = parsed.headers.get('Message-Id')

        new_msg.received_date = received_date

        # Optional mailing list headers
        new_msg.mailing_list_headers = parse_ml_headers(parsed.headers)

        # Custom Inbox header
        new_msg.inbox_uid = parsed.headers.get('X-INBOX-ID')

        # In accordance with JWZ (http://www.jwz.org/doc/threading.html)
        new_msg.references = parse_references(
            parsed.headers.get('References', ''),
            parsed.headers.get('In-Reply-To', ''))

        new_msg.size = len(body_string)  # includes headers text

        i = 0  # for walk_index

        # Store all message headers as object with index 0
        headers_part = Part()
        headers_part.namespace_id = account.namespace.id
        headers_part.message = new_msg
        headers_part.walk_index = i
        headers_part.data = json.dumps(parsed.headers.items())
        new_msg.parts.append(headers_part)

        for mimepart in parsed.walk(
                with_self=parsed.content_type.is_singlepart()):
            i += 1
            if mimepart.content_type.is_multipart():
                log.warning("multipart sub-part found! on {}"
                            .format(new_msg.g_msgid))
                continue  # TODO should we store relations?

            new_part = Part()
            new_part.namespace_id = account.namespace.id
            new_part.message = new_msg
            new_part.walk_index = i
            new_part.misc_keyval = mimepart.headers.items()  # everything
            new_part.content_type = mimepart.content_type.value
            new_part.filename = trim_filename(
                mimepart.content_type.params.get('name'),
                log=log)
            # TODO maybe also trim other headers?

            if mimepart.content_disposition[0] is not None:
                value, params = mimepart.content_disposition
                if value not in ['inline', 'attachment']:
                    # The template has four fields, so all four values must
                    # be supplied; with only three, .format() raised
                    # IndexError right where the problem was to be logged.
                    errmsg = (
                        "Unknown Content-Disposition on message {0} found "
                        "in {1}. Bad Content-Disposition was: '{2}' "
                        "Parsed Content-Disposition was: '{3}'"
                    ).format(mid, folder_name,
                             mimepart.content_disposition, value)
                    log.error(errmsg)
                    continue
                else:
                    new_part.content_disposition = value
                    if value == 'attachment':
                        new_part.filename = trim_filename(
                            params.get('filename'),
                            log=log)

            if mimepart.body is None:
                data_to_write = ''
            elif new_part.content_type.startswith('text'):
                data_to_write = mimepart.body.encode('utf-8', 'strict')
                # normalize mac/win/unix newlines
                data_to_write = data_to_write \
                    .replace('\r\n', '\n').replace('\r', '\n')
            else:
                data_to_write = mimepart.body
            if data_to_write is None:
                data_to_write = ''

            new_part.content_id = mimepart.headers.get('Content-Id')
            new_part.data = data_to_write
            new_msg.parts.append(new_part)
    except mime.DecodingError:
        # occasionally iconv will fail via maximum recursion depth
        log_decode_error(account.id, folder_name, mid, body_string)
        log.error('DecodeError, msg logged to {0}'.format(
            get_errfilename(account.id, folder_name, mid)))
        return
    except RuntimeError:
        log_decode_error(account.id, folder_name, mid, body_string)
        log.error('RuntimeError<iconv> msg logged to {0}'.format(
            get_errfilename(account.id, folder_name, mid)))
        return

    new_msg.calculate_sanitized_body()
    return new_msg
from flanker import mime

# Fixture: identifying headers of an email taken from the sync dump that was
# exported to the 'test' db.
with open('tests/data/messages/replyto_message.txt', 'r') as f:
    message = f.read()

parsed = mime.from_string(message)

# Look the two headers up in the order the fixture dict lists them.
message_id, references = (
    parsed.headers.get(header_name)
    for header_name in ('Message-ID', 'References')
)

TEST_MSG = {
    'message-id': message_id,
    'references': references,
}
def __init__(self, account=None, mid=None, folder_name=None,
             received_date=None, flags=None, body_string=None,
             *args, **kwargs):
    """ Parses message data and fills in db metadata and MIME blocks.

    Populates this message and links new Part objects to it through
    relationships. All new objects are uncommitted.

    Threads are not computed here; you gotta do that separately.

    When none of account, mid, folder_name, flags and body_string is
    truthy (as for drafts), only the base-class initialisation runs.
    Parsing failures are logged, the raw message is written out through
    ``_log_decode_error`` and the message is flagged with ``mark_error``.

    Parameters
    ----------
    account
        Owning account; ``account.namespace`` must not be None.
    mid : int
        The account backend-specific message identifier; it's only used for
        logging errors.
    folder_name
        Folder the message came from; only used for logging errors.
    received_date
        Receive time; when falsy it is derived from the Date and Received
        headers with ``get_internaldate``.
    flags
        Must be supplied with the other required arguments; not otherwise
        used here.
    body_string : str
        The full message including headers (encoded, never ``unicode``).

    Raises
    ------
    ValueError
        If some, but not all, of the required arguments are supplied.
    """
    _rqd = [account, mid, folder_name, flags, body_string]

    MailSyncBase.__init__(self, *args, **kwargs)

    # for drafts
    if not any(_rqd):
        return

    # At least one required argument was given, so all of them must be.
    if not all([v is not None for v in _rqd]):
        raise ValueError(
            "Required keyword arguments: account, mid, folder_name, "
            "flags, body_string")

    # stop trickle-down bugs
    assert account.namespace is not None
    assert not isinstance(body_string, unicode)

    try:
        parsed = mime.from_string(body_string)

        mime_version = parsed.headers.get('Mime-Version')
        # sometimes MIME-Version is "1.0 (1.0)", hence the .startswith()
        if mime_version is not None and not mime_version.startswith('1.0'):
            log.warning('Unexpected MIME-Version',
                        account_id=account.id, folder_name=folder_name,
                        mid=mid, mime_version=mime_version)

        self.data_sha256 = sha256(body_string).hexdigest()

        # clean_subject strips re:, fwd: etc.
        self.subject = parsed.clean_subject
        self.from_addr = parse_email_address_list(
            parsed.headers.get('From'))
        self.sender_addr = parse_email_address_list(
            parsed.headers.get('Sender'))
        self.reply_to = parse_email_address_list(
            parsed.headers.get('Reply-To'))

        self.to_addr = parse_email_address_list(
            parsed.headers.getall('To'))
        self.cc_addr = parse_email_address_list(
            parsed.headers.getall('Cc'))
        self.bcc_addr = parse_email_address_list(
            parsed.headers.getall('Bcc'))

        self.in_reply_to = parsed.headers.get('In-Reply-To')
        self.message_id_header = parsed.headers.get('Message-Id')

        self.received_date = received_date if received_date else \
            get_internaldate(parsed.headers.get('Date'),
                             parsed.headers.get('Received'))

        # Custom Inbox header
        self.inbox_uid = parsed.headers.get('X-INBOX-ID')

        # In accordance with JWZ (http://www.jwz.org/doc/threading.html)
        self.references = parse_references(
            parsed.headers.get('References', ''),
            parsed.headers.get('In-Reply-To', ''))

        self.size = len(body_string)  # includes headers text

        i = 0  # for walk_index

        from inbox.models.block import Part

        # Store all message headers as object with index 0
        headers_part = Part()
        headers_part.namespace_id = account.namespace.id
        headers_part.message = self
        headers_part.walk_index = i
        headers_part.data = json.dumps(parsed.headers.items())
        self.parts.append(headers_part)

        for mimepart in parsed.walk(
                with_self=parsed.content_type.is_singlepart()):
            i += 1
            if mimepart.content_type.is_multipart():
                log.warning('multipart sub-part found',
                            account_id=account.id, folder_name=folder_name,
                            mid=mid)
                continue  # TODO should we store relations?

            new_part = Part()
            new_part.namespace_id = account.namespace.id
            new_part.message = self
            new_part.walk_index = i
            new_part.content_type = mimepart.content_type.value
            new_part.filename = _trim_filename(
                mimepart.content_type.params.get('name'),
                account.id, mid)
            # TODO maybe also trim other headers?

            if mimepart.content_disposition[0] is not None:
                value, params = mimepart.content_disposition
                if value not in ['inline', 'attachment']:
                    log.error('Unknown Content-Disposition',
                              account_id=account.id, mid=mid,
                              folder_name=folder_name,
                              bad_content_disposition=
                              mimepart.content_disposition,
                              parsed_content_disposition=value)
                    continue
                else:
                    new_part.content_disposition = value
                    if value == 'attachment':
                        new_part.filename = _trim_filename(
                            params.get('filename'), account.id, mid)

            if mimepart.body is None:
                data_to_write = ''
            elif new_part.content_type.startswith('text'):
                data_to_write = mimepart.body.encode('utf-8', 'strict')
                # normalize mac/win/unix newlines
                data_to_write = data_to_write \
                    .replace('\r\n', '\n').replace('\r', '\n')
            else:
                data_to_write = mimepart.body
            if data_to_write is None:
                data_to_write = ''

            new_part.content_id = mimepart.headers.get('Content-Id')
            new_part.data = data_to_write
            self.parts.append(new_part)
        self.calculate_sanitized_body()
    except mime.DecodingError:
        # Occasionally iconv will fail via maximum recursion depth. We
        # still keep the metadata and mark it as b0rked.
        _log_decode_error(account.id, folder_name, mid, body_string)
        log.error('Message parsing DecodeError', account_id=account.id,
                  folder_name=folder_name, err_filename=_get_errfilename(
                      account.id, folder_name, mid))
        self.mark_error()
        return
    except AttributeError:
        # For EAS messages that are missing Date + Received headers, due
        # to the processing we do in inbox.util.misc.get_internaldate()
        _log_decode_error(account.id, folder_name, mid, body_string)
        log.error('Message parsing AttributeError', account_id=account.id,
                  folder_name=folder_name, err_filename=_get_errfilename(
                      account.id, folder_name, mid))
        self.mark_error()
        return
    except RuntimeError:
        _log_decode_error(account.id, folder_name, mid, body_string)
        # Pass the context as structured fields like the other handlers;
        # calling .format() on a message without placeholders discarded it.
        log.error('Message parsing RuntimeError<iconv>',
                  account_id=account.id, folder_name=folder_name,
                  err_filename=_get_errfilename(
                      account.id, folder_name, mid))
        self.mark_error()
        return
def dosync(self):
    """Poll the inbox once and reply to every new mail that carries TRIGGER.

    Steps, in order:

    1. Search the mailbox for all message ids, reconnecting and retrying
       when the search fails.
    2. Collect the ids that come after ``self.last_id`` and advance
       ``self.last_id`` to the newest id.
    3. For each new mail, find the trigger line, resolve the command and
       either store a new template, extend the whitelist, or send the
       matching canned reply over SMTP.

    A mail that cannot be answered (unknown command, malformed ``template``
    or ``whitelist`` request) is skipped; the remaining new mails are still
    processed.  Returns None.
    """
    print("You\'ve Got Mail.")

    while True:
        try:
            _, data = self.mail.search(None, "ALL")
            break
        except Exception:
            # Attempt reconnect.  ``Exception`` rather than a bare except so
            # Ctrl-C / SystemExit can still stop the retry loop.
            print("Disconnected, attempting reconnect.")
            self.mail = imaplib2.IMAP4_SSL(IMAP_SERVER)
            self.mail.login(MAIL_USER, MAIL_PASSWORD)
            self.mail.select("inbox", readonly=True)

    id_list = (data[0] or "").split()
    if not id_list:
        # Empty mailbox: nothing to answer and no newest id to remember.
        return

    new_mail_ids = []
    # NOTE(review): ids are compared as strings here, so "10" sorts before
    # "9" -- confirm whether a numeric comparison is intended.
    if not id_list[-1] < self.last_id:
        # Walk from newest to oldest until the last handled id shows up.
        # The oldest id (index 0) is deliberately left out, as before.
        for candidate in reversed(id_list[1:]):
            if candidate == self.last_id:
                break
            new_mail_ids.append(candidate)
    self.last_id = id_list[-1]

    for mail_id in new_mail_ids:
        _, data = self.mail.fetch(mail_id, "(RFC822)")

        raw_email = "null"
        for d in data:
            if isinstance(d, tuple) and "RFC822" in d[0]:
                raw_email = d[1]
        if raw_email == "null":
            continue

        email_message = email.message_from_string(raw_email)
        flanker_msg = mime.from_string(raw_email)

        body = "null"
        try:
            for part in flanker_msg.parts:
                text = part.body.encode('ascii', 'ignore')
                if text.startswith(TRIGGER):
                    body = text
                    break
        except Exception:
            # Best effort only: any failure while decoding the MIME parts
            # falls through to the raw line scan below.
            pass

        # If body is still null, just look for this stuff
        if body == "null":
            for raw_line in raw_email.split('\n'):
                if raw_line.startswith(TRIGGER):
                    body = raw_line

        # CR-LF ugh
        body = body.replace('\r', '')

        COMMANDS = load_commands()

        if not (body.startswith(TRIGGER) and "From" in email_message):
            continue

        words = body.split(' ')
        if len(words) >= 2:
            command = words[1].strip()

            # Ugly custom rule
            if command.startswith('edu'):
                command = "edu"

            if command.startswith('mixed'):
                command = "mixed"

            # Hacky: snap to the first known command that is a prefix
            for known in COMMANDS:
                if command.startswith(known):
                    command = known
                    break
        else:
            command = "faq"

        print("Request from {} for subject {} with command {}.".format(
            email_message["From"], email_message["Subject"], command))

        tos = email_message.get_all('to', [])
        ccs = email_message.get_all('cc', [])
        all_recipients = getaddresses(tos + ccs) + [
            parseaddr(email_message["Reply-To"] or email_message["From"])
        ]

        if command.startswith('template'):
            lines = body.strip().split('\n')
            header_words = lines[0].split()
            if len(header_words) < 3:
                # "<trigger> template" without a name: nothing to store.
                continue
            new_command = header_words[2]
            content = '<br>\n'.join(lines[1:])

            print('Request from {} for new command {} with body:'.format(
                email_message["From"], new_command))
            print(content)

            COMMANDS[new_command] = content
            save_commands(COMMANDS)
            # ``continue`` (not ``return``): last_id has already moved past
            # every id in new_mail_ids, so returning would lose the rest.
            continue

        if command.startswith('whitelist'):
            # Compute the whitelist email
            wl_email = None
            for line in body.split('\n'):
                if line.startswith(TRIGGER):
                    tokens = line.split(' ')
                    if len(tokens) >= 3:
                        wl_email = tokens[2]

            if not wl_email:
                continue

            print(" ".join(["Whitelist Email:", wl_email]))

            # Post to quill
            quill.post_wl(quill.get_wl() + [wl_email])
            content = COMMANDS['whitelist'].format(email=wl_email) + FOOTER
        else:
            if command not in COMMANDS:
                continue
            content = COMMANDS[command] + FOOTER

        # A mail without a Subject header yields None; treat it as empty.
        subject = email_message["Subject"] or ""
        reply_sujet = subject if subject.startswith('Re:') else "Re: " + subject

        recipients = [r[1] for r in all_recipients]

        # Try to find the initial sender
        recipients += email_finder.get_emails(raw_email)

        # Remove dupes
        recipients = list(set(recipients))

        print(recipients)

        msg = MIMEText(content, 'html')
        msg['Subject'] = reply_sujet
        msg["Message-ID"] = email.utils.make_msgid()
        msg["In-Reply-To"] = email_message["Message-ID"]
        msg["References"] = email_message["Message-ID"]
        msg["To"] = ", ".join(recipients)
        msg["From"] = MAIL_FROM

        s = smtplib.SMTP_SSL(SMTP_SERVER)
        s.login(SEND_MAIL_USER, SEND_MAIL_PASSWORD)
        s.sendmail(MAIL_FROM, recipients, msg.as_string())
        s.quit()
""" import sys from flanker import mime from sender import Message from sender import Mail import time #SETTINGS smtp_hostname = "" smtp_port = 25 smtp_username = "" smtp_password = "" smtp_security = "SSL" #END SETTINGS input = sys.stdin.read() text = mime.from_string(input) msg = Message(text.headers['subject']) msg.fromaddr = text.headers['from'] msg.to = text.headers['to'] msg.body = text.body msg.date = time.time() msg.charset = "utf-8" #Check SSL or TLS smtp_ssl_use = False smtp_tls_use = False if smtp_security == "SSL": smtp_ssl_use = True elif smtp_security == "TLS":
logging.basicConfig(level=logging.DEBUG) if not path.exists(args.privatekeyfile): sys.exit("Private key file not found.") if not path.exists(args.messagefile): sys.exit("Message file not found.") if sys.version_info[0] >= 3: args.selector = bytes(args.selector, encoding=UTF8_ENCODING) args.domain = bytes(args.domain, encoding=UTF8_ENCODING) args.headers = bytes(args.headers, encoding=UTF8_ENCODING) # read file contents message = bytes(open(args.messagefile, 'rb').read()) private_key = bytes(open(args.privatekeyfile, 'rb').read()) arc_headers_present = False mime = mime.from_string(message) if len(mime.headers.getall('ARC-Seal')) > 0: arc_headers_present = True authres_header = get_authres_header(args.srvid, arc_headers_present) message_with_authres = bytes(authres_header, encoding=UTF8_ENCODING) + message logging.debug("Message with authres: %s", message_with_authres) signature = sign_message(message_with_authres, args.selector, args.domain, private_key, args.headers.split(b':'), 'ARC', bytes(args.srvid, encoding=UTF8_ENCODING)) if len(signature) == 0: sys.exit("Unable to generate arc headers") separator = "#####" signature[0] = signature[0].decode(UTF8_ENCODING).replace( "ARC-Seal: ", "ARC-Seal" + separator)
def collect_data():
    """Download past "@faqbot" replies and save them as training pairs.

    Searches "[Gmail]/All Mail" for messages whose body contains "@faqbot".
    For each hit, takes the last text/plain part, splits it into reply
    fragments with EmailReplyParser and, when the first fragment starts
    with "@faqbot <key>", pairs ``key`` with the quoted lines ("> ...") of
    the second fragment.  Pairs whose key is a known template are written
    with ``save_config(..., 'smartreply_data')``.

    Messages that do not fit this shape are skipped instead of aborting the
    whole download.  Returns None.
    """
    templates = load_config('templates')['templates']
    training_data = []

    mail = imaplib2.IMAP4_SSL(IMAP_SERVER)
    try:
        mail.login(MAIL_USER, MAIL_PASSWORD)
        mail.select("[Gmail]/All Mail", readonly=True)

        _, data = mail.search(None, '(BODY "%s")' % ("@faqbot"))
        id_list = data[0].split()

        for idx, r_id in enumerate(id_list):
            _, data = mail.fetch(r_id, "(RFC822)")

            print("%i / %i (%i%%)" % (idx, len(id_list),
                                      int(float(idx) / len(id_list) * 100)))

            raw_email = "null"
            for d in data:
                if isinstance(d, tuple) and "RFC822" in d[0]:
                    raw_email = d[1]
            if raw_email == "null":
                # No RFC822 payload came back; nothing to parse.
                continue

            flanker_msg = mime.from_string(raw_email)

            body = "null"
            try:
                for part in flanker_msg.parts:
                    # No break: the last text/plain part wins.
                    if str(part) == "(text/plain)":
                        body = part.body.encode('ascii', 'ignore')
            except Exception:
                # Best effort: an undecodable message is simply skipped
                # below when no body has been found yet.
                pass

            if body == "null":
                continue

            parsed_body = EmailReplyParser.read(body)
            if len(parsed_body.fragments) < 2:
                continue

            # Expect "@faqbot <key> ..." at the start of the reply itself.
            reply_words = parsed_body.fragments[0].content.split()
            if len(reply_words) < 2 or reply_words[0] != "@faqbot":
                continue
            key = reply_words[1]

            quoted = []
            for line in parsed_body.fragments[1].content.split('\n'):
                if line.startswith('> '):
                    stripped = line.replace('>', '').strip()
                    # Drop blank quotes and the "On ... wrote:" attribution.
                    if stripped != '' and not stripped.startswith('On'):
                        quoted.append(line.replace('>', ''))
            original = '\n'.join(quoted)

            # Now that we have this, let's make sure it's
            # valid and stuff and then save it.
            if key in templates:
                training_data.append((key, original))
    finally:
        # Always release the IMAP session, even when a fetch fails.
        mail.logout()

    save_config(training_data, 'smartreply_data')
opt_parser.add_argument('--format', dest="format", type=str, help='The output format: json, msgpack, debug', default="debug") args = opt_parser.parse_args() if not args.file: print("An argument is required (the name of the file)") sys.exit(1) file_path = args.file if not os.path.isfile(file_path): print("The file specified does not exist") sys.exit(1) with open(file_path, 'r') as f: file_contents = f.read() mimepart = mime.from_string(file_contents) parser = Parser() msg = parser.message_from_mimepart(mimepart) if args.format == "json": print message_to_json(msg) elif args.format == "msgpack": print message_to_msgpack(msg) else: message_to_debug_out(msg)
return results, indicators if __name__ == '__main__': argParser = argparse.ArgumentParser(description='email_abuse parser') argParser.add_argument( '-r', default='-', help='Filename of the raw email to read (default: stdin)') argParser.add_argument( '-o', default='ascii', help='Output format: ascii or json (default: ascii)') args = argParser.parse_args() if args.r == '-': msg = mime.from_string(sys.stdin.read()) else: fp = open(args.r, 'rb') msg = mime.from_string(fp.read()) msg_file = init(msg) subject = msg.subject passwordlist = [ "password", "passw0rd", "infected", "qwerty", "malicious", "archive", "zip" ] indicators = 0 examine_headers = ExamineHeaders(msg) origin_ip, rbl_listed, rbl_comment, mailfrom, mailto, origin_domain = examine_headers.processing(
def remove_code(msg):
    """Run *msg* through the three removal passes in order and return it.

    The passes are remove_content_in_braces, remove_func_and_struct and
    remove_other_code_lines, each fed the previous pass's result.
    """
    msg = remove_content_in_braces(msg)
    msg = remove_func_and_struct(msg)
    msg = remove_other_code_lines(msg)
    return msg


# remove_code(msg)
# Accumulates the kept (non-quoted) lines of every single-part message.
rt = ''
fpath = "a1/*.email"
files = glob.glob(fpath)
for email_path in files:
    # Read inside ``with`` so each handle is closed before the next file.
    with open(email_path, "r") as f:
        msg = f.read()
    msg = mime.from_string(msg)
    if msg.content_type.is_singlepart():
        temp = str(msg.body)
        temp = temp.splitlines()
        for line in temp:
            # Skip quoted text and lines starting with "On"
            # (presumably "On ... wrote:" attributions).
            if line.startswith(('>', 'On')):
                continue
            rt += line + "\n"
    else:
        for part in msg.parts:
            if "(text/plain)" in str(part):
                temp = str(part.body)
                temp = temp.splitlines()
def create_from_synced(cls, account, mid, folder_name, received_date,
                       body_string):
    """
    Build a new, uncommitted Message from the raw bytes of a synced mail.

    The raw message is parsed with flanker; header-derived metadata is
    copied onto the Message, the full header list is stored as the part
    with walk index 0, and every non-multipart MIME part is handed to
    ``_parse_mimepart``.  Parsing failures do not raise: the message is
    marked as errored and returned with whatever metadata was set before
    the failure.  Threads are not computed here.

    Parameters
    ----------
    account
        Account the message belongs to; its namespace must be set.
    mid : int
        The account backend-specific message identifier; used when
        logging and when parsing parts.
    folder_name
        Name of the folder the message was synced from; used for logging.
    received_date
        Received timestamp; when falsy it is derived from the Date and
        Received headers instead.
    body_string : str
        The full message including headers (encoded, not unicode).

    Returns
    -------
    Message
        The new message object.

    Raises
    ------
    ValueError
        If account, mid, folder_name or body_string is None.
    """
    required = (account, mid, folder_name, body_string)
    if any(arg is None for arg in required):
        raise ValueError(
            "Required keyword arguments: account, mid, folder_name, "
            "body_string")
    # stop trickle-down bugs
    assert account.namespace is not None
    assert not isinstance(body_string, unicode)

    msg = Message()

    try:
        msg.namespace_id = account.namespace.id
        parsed = mime.from_string(body_string)
        headers = parsed.headers

        # sometimes MIME-Version is "1.0 (1.0)", hence the .startswith()
        mime_version = headers.get('Mime-Version')
        version_ok = mime_version is None or mime_version.startswith('1.0')
        if not version_ok:
            log.warning('Unexpected MIME-Version',
                        account_id=account.id, folder_name=folder_name,
                        mid=mid, mime_version=mime_version)

        msg.data_sha256 = sha256(body_string).hexdigest()

        # clean_subject strips re:, fwd: etc.
        msg.subject = parsed.clean_subject
        msg.from_addr = parse_mimepart_address_header(parsed, 'From')
        msg.sender_addr = parse_mimepart_address_header(parsed, 'Sender')
        msg.reply_to = parse_mimepart_address_header(parsed, 'Reply-To')
        msg.to_addr = parse_mimepart_address_header(parsed, 'To')
        msg.cc_addr = parse_mimepart_address_header(parsed, 'Cc')
        msg.bcc_addr = parse_mimepart_address_header(parsed, 'Bcc')

        msg.in_reply_to = headers.get('In-Reply-To')
        msg.message_id_header = headers.get('Message-Id')

        if received_date:
            msg.received_date = received_date
        else:
            msg.received_date = get_internaldate(headers.get('Date'),
                                                 headers.get('Received'))

        # Custom Inbox header
        msg.inbox_uid = headers.get('X-INBOX-ID')

        # In accordance with JWZ (http://www.jwz.org/doc/threading.html)
        msg.references = parse_references(
            headers.get('References', ''),
            headers.get('In-Reply-To', ''))

        msg.size = len(body_string)  # includes headers text

        from inbox.models.block import Block, Part

        # Store all message headers as object with index 0
        header_block = Block()
        header_block.namespace_id = account.namespace.id
        header_block.data = json.dumps(headers.items())

        headers_part = Part(block=header_block, message=msg)
        headers_part.walk_index = 0
        msg.parts.append(headers_part)

        # The root part is only walked when the message is single-part.
        # The index advances for skipped multipart containers as well.
        include_root = parsed.content_type.is_singlepart()
        walker = parsed.walk(with_self=include_root)
        for walk_index, mimepart in enumerate(walker, 1):
            if mimepart.content_type.is_multipart():
                log.warning('multipart sub-part found',
                            account_id=account.id,
                            folder_name=folder_name,
                            mid=mid)
                continue  # TODO should we store relations?
            msg._parse_mimepart(mimepart, mid, walk_index,
                                account.namespace.id)

        msg.calculate_sanitized_body()
    except (mime.DecodingError, AttributeError, RuntimeError) as e:
        # Message parsing can fail for several reasons. Occasionally iconv
        # will fail via maximum recursion depth. EAS messages may be
        # missing Date and Received headers. In such cases, we still keep
        # the metadata and mark it as b0rked.
        _log_decode_error(account.id, folder_name, mid, body_string)
        err_filename = _get_errfilename(account.id, folder_name, mid)
        log.error('Message parsing error',
                  folder_name=folder_name, account_id=account.id,
                  err_filename=err_filename, error=e)
        msg._mark_error()

    # Occasionally people try to send messages to way too many
    # recipients. In such cases, empty the field and treat as a parsing
    # error so that we don't break the entire sync.
    for field in ('to_addr', 'cc_addr', 'bcc_addr', 'references'):
        if not json_field_too_long(getattr(msg, field)):
            continue
        _log_decode_error(account.id, folder_name, mid, body_string)
        err_filename = _get_errfilename(account.id, folder_name, mid)
        log.error('Recipient field too long', field=field,
                  account_id=account.id, folder_name=folder_name,
                  mid=mid)
        setattr(msg, field, [])
        msg._mark_error()

    return msg
from flanker import mime


def _print_unquoted(body):
    """Print each line of *body* that is not a '>' quote, then a separator."""
    for line in str(body).splitlines():
        if line.startswith('>'):
            continue
        print("*** " + line)
    print("********************************")


fpath = "3.email"
# Read the whole message first so the file handle is closed right away.
with open(fpath, 'r') as f:
    mailmsg = f.read()
msg = mime.from_string(mailmsg)
# print (msg.headers.items())
print("printing email message!!")
if msg.content_type.is_singlepart():
    _print_unquoted(msg.body)
elif msg.content_type.is_multipart():
    # Only parts whose string form contains "(text/plain)" are shown.
    for part in msg.parts:
        if "(text/plain)" in str(part):
            _print_unquoted(part.body)