def extract_body(message: message.Message, include_quotes: bool = False, prefer_text: bool = True) -> str:
    """Return the body of an email, choosing between its plaintext and HTML forms.

    Both forms are extracted up front with ``extract_plaintext_body`` and
    ``extract_html_body``.  ``prefer_text`` selects which form wins when both
    are non-empty; the other form is used as the fallback.

    Raises:
        ZulipEmailForwardUserError: if both extractors returned ``None``
            (the content types of all parts are logged first), or if
            neither extractor produced a non-empty body.
    """
    text_body = extract_plaintext_body(message, include_quotes)
    html_body = extract_html_body(message, include_quotes)

    if text_body is None and html_body is None:
        part_types = [part.get_content_type() for part in message.walk()]
        logging.warning("Content types: %s" % (part_types, ))
        raise ZulipEmailForwardUserError(
            "Unable to find plaintext or HTML message body")
    if not (text_body or html_body):
        raise ZulipEmailForwardUserError(
            "Email has no nonempty body sections; ignoring.")

    # Order the two candidates by preference, then take the first usable one.
    if prefer_text:
        preferred, fallback = text_body, html_body
    else:
        preferred, fallback = html_body, text_body

    if preferred:
        return preferred
    assert fallback  # Needed for mypy. Ensured by the validating block above.
    return fallback
def extract_body(message: message.Message, include_quotations: bool=False) -> str:
    """Return the message body, preferring plaintext over HTML.

    talon is imported and initialised on first use; the module-level
    ``talon_initialized`` flag records that this has happened.  Unless
    ``include_quotations`` is true, quoted replies are stripped with talon.
    An HTML body is always passed through ``convert_html_to_markdown``.

    Raises:
        ZulipEmailForwardUserError: if neither a text/plain nor a text/html
            part yields non-empty content.
    """
    import talon
    global talon_initialized
    if not talon_initialized:
        talon.init()
        talon_initialized = True

    # A plaintext version of the body wins whenever it is non-empty.
    text_body = get_message_part_by_type(message, "text/plain")
    if text_body:
        if not include_quotations:
            text_body = talon.quotations.extract_from_plain(text_body)
        return text_body

    # Otherwise fall back to the HTML version and try to make it look nice.
    html_body = get_message_part_by_type(message, "text/html")
    if html_body:
        if not include_quotations:
            html_body = talon.quotations.extract_from_html(html_body)
        return convert_html_to_markdown(html_body)

    if text_body is None and html_body is None:
        # No part of either type was returned at all: log what we did see.
        seen_types = [part.get_content_type() for part in message.walk()]
        logging.warning("Content types: %s" % (seen_types,))
        raise ZulipEmailForwardUserError("Unable to find plaintext or HTML message body")

    raise ZulipEmailForwardUserError("Email has no nonempty body sections; ignoring.")
def extract_and_upload_attachments(message: message.Message, realm: Realm) -> str:
    """Upload every named part of ``message`` and return markdown links to them.

    Each part that carries a filename is uploaded with ``upload_message_file``
    on behalf of the ``settings.EMAIL_GATEWAY_BOT`` system bot, targeting
    ``realm``.  The result is one ``[filename](url)`` link per uploaded part,
    joined with newlines (an empty string if nothing was uploaded).  Parts
    whose decoded payload is not ``bytes`` are logged and skipped.
    """
    uploader = get_system_bot(settings.EMAIL_GATEWAY_BOT)
    links = []

    for part in message.walk():
        raw_name = part.get_filename()
        if not raw_name:
            continue

        filename = handle_header_content(raw_name)
        if not filename:
            continue

        payload = part.get_payload(decode=True)
        if not isinstance(payload, bytes):
            logger.warning(
                "Payload is not bytes (invalid attachment %s in message from %s)."
                % (filename, message.get("From")))
            continue

        url = upload_message_file(
            filename,
            len(payload),
            part.get_content_type(),
            payload,
            uploader,
            target_realm=realm,
        )
        links.append("[%s](%s)" % (filename, url))

    return '\n'.join(links)
def obtainImageInfosFromMessage(message):
    """Return ``(src, alt)`` pairs for the ``.jpg`` images in the HTML part.

    The last ``text/html`` part found while walking ``message`` is used.
    Its serialised form has all CR/LF characters removed before being
    handed to BeautifulSoup.

    Returns:
        ``None`` when the message has no ``text/html`` part, otherwise a
        list of ``(src, alt)`` tuples for every ``<img>`` whose ``src``
        contains ``".jpg"``.  ``alt`` is ``None`` when the attribute is
        missing.
    """
    # Get the HTML part of the message; a later match overrides an earlier one.
    html_part = None
    for part in message.walk():
        if part.get_content_type() == "text/html":
            html_part = part

    if html_part is None:
        return None

    # NOTE(review): as_string() serialises the part including its headers and
    # without undoing the transfer encoding -- confirm this is intended.
    html_str = html_part.as_string().replace("\r", "").replace("\n", "")

    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    soup = bs4.BeautifulSoup(html_str)

    # An <img> without a src attribute yields None; treat it as "not a jpg"
    # instead of crashing on None.find().
    return [
        (img.get("src"), img.get("alt"))
        for img in soup.find_all("img")
        if ".jpg" in (img.get("src") or "")
    ]
def extract_body(message: message.Message, remove_quotations: bool=True) -> str:
    """Return the message body, using plaintext when available, else HTML.

    talon is imported and initialised once; the module-level
    ``talon_initialized`` flag tracks that.  When ``remove_quotations`` is
    true, quoted text is stripped with talon before the body is returned.
    HTML bodies are converted with ``convert_html_to_markdown``.

    Raises:
        ZulipEmailForwardUserError: if no non-empty text/plain or text/html
            content could be obtained from the message.
    """
    import talon
    global talon_initialized
    if not talon_initialized:
        talon.init()
        talon_initialized = True

    # If the message contains a plaintext version of the body, use that.
    plain = get_message_part_by_type(message, "text/plain")
    if plain:
        return talon.quotations.extract_from_plain(plain) if remove_quotations else plain

    # If we only have an HTML version, try to make that look nice.
    html = get_message_part_by_type(message, "text/html")
    if html:
        cleaned = talon.quotations.extract_from_html(html) if remove_quotations else html
        return convert_html_to_markdown(cleaned)

    found_empty_part = plain is not None or html is not None
    if found_empty_part:
        raise ZulipEmailForwardUserError("Email has no nonempty body sections; ignoring.")

    content_types = [part.get_content_type() for part in message.walk()]
    logging.warning("Content types: %s" % (content_types))
    raise ZulipEmailForwardUserError("Unable to find plaintext or HTML message body")
def _get_ics_part(self, message):
    """Return the last ``text/calendar`` part of ``message``, or ``None``."""
    calendar_parts = [
        candidate
        for candidate in message.walk()
        if candidate.get_content_type() == 'text/calendar'
    ]
    # When several calendar parts exist, the one encountered last wins.
    return calendar_parts[-1] if calendar_parts else None
def get_message_part_by_type(message: message.Message, content_type: str) -> Optional[str]:
    """Return the decoded text of the first part whose type is ``content_type``.

    The part's payload is transfer-decoded and then decoded to ``str`` with
    the charset declared on that part, ignoring undecodable bytes.

    Returns:
        The decoded text, or ``None`` when no part has the requested type.
    """
    for part in message.walk():
        if part.get_content_type() != content_type:
            continue
        content = part.get_payload(decode=True)
        assert isinstance(content, bytes)
        # A part without an explicit charset used to be skipped, which made
        # such bodies look missing.  Assume us-ascii instead, per RFC 6657.
        charset = part.get_content_charset() or "us-ascii"
        return content.decode(charset, errors="ignore")
    return None
def get_message_part_by_type(message, content_type):
    # type: (message.Message, text_type) -> text_type
    """Return the decoded text of the first part whose type is ``content_type``.

    The payload is transfer-decoded and then decoded to text with the
    charset declared on the part, ignoring undecodable bytes.  Falls off the
    end (returning ``None``) when no part has the requested type.
    """
    for part in message.walk():
        if part.get_content_type() != content_type:
            continue
        content = part.get_payload(decode=True)
        if content is None:
            # get_payload(decode=True) yields None for multipart containers;
            # there is nothing to decode in that case.
            return content
        # Previously a part without a declared charset was returned as raw
        # bytes, contradicting the declared text return type.  Assume
        # us-ascii instead, per RFC 6657.
        charset = part.get_content_charset() or "us-ascii"
        return content.decode(charset, errors="ignore")
def get_message_part_by_type(message, content_type):
    # type: (message.Message, text_type) -> text_type
    """Return the decoded text of the first part whose type is ``content_type``.

    The payload is transfer-decoded and then decoded to text with the
    charset declared on the part, ignoring undecodable bytes.  Falls off the
    end (returning ``None``) when no part has the requested type.
    """
    for part in message.walk():
        if part.get_content_type() != content_type:
            continue
        content = part.get_payload(decode=True)
        # ``bytes`` replaces the external ``binary_type`` alias here
        # (assumed to be six's, which is ``bytes`` on Python 2 and 3).
        assert isinstance(content, bytes)
        # Without a declared charset the old code returned an unbound local
        # and crashed.  Assume us-ascii instead, per RFC 6657.
        charset = part.get_content_charset() or "us-ascii"
        return content.decode(charset, errors="ignore")
def get_message_part_by_type(message, content_type):
    # type: (message.Message, Text) -> Text
    """Return the decoded text of the first part whose type is ``content_type``.

    The payload is transfer-decoded and then decoded to text with the
    charset declared on the part, ignoring undecodable bytes.  Falls off the
    end (returning ``None``) when no part has the requested type.
    """
    for part in message.walk():
        if part.get_content_type() != content_type:
            continue
        content = part.get_payload(decode=True)
        # ``bytes`` replaces the external ``binary_type`` alias here
        # (assumed to be six's, which is ``bytes`` on Python 2 and 3).
        assert isinstance(content, bytes)
        # A part with no declared charset used to hit ``return text`` with
        # ``text`` never assigned.  Assume us-ascii instead, per RFC 6657.
        charset = part.get_content_charset() or "us-ascii"
        return content.decode(charset, errors="ignore")
def get_message_part_by_type(message: message.Message, content_type: str) -> Optional[str]:
    """Return the decoded text of the first part whose type is ``content_type``.

    The payload is transfer-decoded, then decoded to ``str`` using the part's
    declared charset with undecodable bytes ignored.  Returns ``None`` when
    no part of the requested type exists.
    """
    # get_charsets() lists one entry per part, in the same order as walk().
    for declared_charset, part in zip(message.get_charsets(), message.walk()):
        if part.get_content_type() != content_type:
            continue
        payload = part.get_payload(decode=True)
        assert isinstance(payload, bytes)
        # If no charset has been specified in the header, assume us-ascii,
        # by RFC6657: https://tools.ietf.org/html/rfc6657
        codec = declared_charset if declared_charset else "us-ascii"
        return payload.decode(codec, errors="ignore")
    return None
def test_attachment_image_is_stored(self):
    """An injected mail with a PNG attachment yields one stored image file.

    The attachment's Content-Disposition is rewritten to carry the filename
    ``foo.baz.png``; the stored file is then checked for size, image
    detection and the shape of its on-disk name.
    """
    image_bytes = b'image-data'
    message = self.get_message_html_plain('Text Alternative2', 'hypertext2', 'plaintext2')
    message.add_attachment(image_bytes, maintype='image', subtype='png')

    png_parts = [
        part for part in message.walk()
        if part.get_content_type() == 'image/png'
    ]
    for png_part in png_parts:
        png_part.replace_header('Content-Disposition',
                                'attachment; filename="foo.baz.png"')

    self.inject_mail(
        self.gestalt.user.email,
        [self.group_address],
        data=message.as_bytes(),
    )

    contribution = self.assertExists(
        models.Contribution,
        conversation__subject='Text Alternative2',
        text__text='hypertext2',
    )
    self.assertEqual(contribution.attachments.count(), 1)

    stored = contribution.attachments.first().file.first()
    self.assertEqual(stored.file.size, len(image_bytes))
    # the check is only based on the content disposition (filename)
    self.assertTrue(stored.is_image())

    stored_path = stored.file.path
    base_name = os.path.basename(stored_path)
    self.assertTrue(base_name.startswith('foo.baz-'), stored_path)
    self.assertTrue(base_name.endswith('.png'), stored_path)
def parseFile(message):
    """Collect the names and decoded payloads of all attachments in a message.

    :param message: the parsed (structured) email
    :return parsefile: attachment contents as a dictionary with two parallel
        lists, ``'filename'`` (decoded names) and ``'filedata'`` (payloads as
        returned by ``get_payload(decode=True)``)
    """
    parsefile = {'filename': [], 'filedata': []}

    for msg in message.walk():
        # Get the attachment name; parts without one are not attachments.
        filename = msg.get_filename()
        if not filename:
            continue

        # Parse the attachment name.
        header = email.header.Header(filename)
        raw_name, charset = email.header.decode_header(header)[0]
        filename = raw_name

        # The name may still be a raw RFC 2047 encoded word
        # ("=?charset?B?payload?="); only the base64 form is handled here.
        if isinstance(filename, bytes) and b'=?' in filename and b'?=' in filename:
            try:
                fields = filename.split(b'?')
                filename = base64.decodebytes(fields[-2]).decode(fields[1].decode())
            except (ValueError, LookupError, IndexError):
                # Bad base64, undecodable bytes or an unknown codec: keep the
                # raw name and let the charset step below handle it.
                pass

        if isinstance(filename, bytes) and charset:
            try:
                filename = filename.decode(charset)
            except (UnicodeDecodeError, LookupError):
                pass

        if isinstance(filename, bytes):
            # Last resort so the string handling below never sees bytes.
            filename = filename.decode('utf-8', errors='replace')

        # Get the attachment data.
        filedata = msg.get_payload(decode=True)

        # Collapse a doubled trailing extension ("a.txt.txt" -> "a.txt").
        # Names without any dot have a single component and are left alone;
        # indexing fns[-2] on them used to raise IndexError.
        fns = filename.split('.')
        if len(fns) >= 2 and fns[-1] == fns[-2]:
            filename = '.'.join(fns[:-1])

        # Add the attachment name and data to the result dictionary.
        parsefile['filename'].append(filename)
        parsefile['filedata'].append(filedata)

    return parsefile
def parse(data, codec=Codec):  # noqa: C901
    """Parse a raw email into the fields expected by ``codec.update_item``.

    Args:
        data: the raw email as bytes.
        codec: object providing ``re_new()``, ``re_external_id()``,
            ``re_id()`` and ``update_item(...)``.

    Returns:
        Whatever ``codec.update_item`` returns for the extracted domain,
        issuer, recipient, identifier, external id, "new" flag, subject,
        JSON part, text part and attachments.
    """
    message = email.message_from_bytes(data)

    # ---------------------------------------------------------------- HEADERS
    _, issuer = email.utils.parseaddr(message.get("from"))
    subject = utils_parse.normalize(message.get("subject"))
    recipient, identifier, domain = get_address_parts(message.get("to"))

    new_tag, subject = utils_parse.consume_re(codec.re_new(), subject)
    new = bool(new_tag)
    external_id, subject = utils_parse.consume_re(codec.re_external_id(), subject)

    # Fall back to the In-Reply-To address when "To" carried no identifier.
    if not identifier and "in-reply-to" in message:
        irt = get_address_parts(message.get("in-reply-to"))
        if irt[0] == recipient and irt[2] == domain:
            if irt[1] == "new":
                # NOTE(review): ``new`` was already computed above, so this
                # assignment does not reach update_item -- confirm whether
                # ``new`` should be recomputed here.
                new_tag = True
            else:
                identifier = irt[1]

    if not identifier:
        identifier, subject = utils_parse.consume_re(codec.re_id(), subject)

    # ------------------------------------------------------------------- BODY
    json_part, text_part, html_part, attachments = "", "", "", []
    for part in message.walk():
        # sub-parts are iterated over in this walk
        if part.is_multipart():
            continue

        content_type = part.get_content_type()
        maintype = part.get_content_maintype()
        payload = part.get_payload(decode=True)

        if geojson and content_type == "application/geo+json":
            try:
                attachments.append(
                    (content_type, geojson.loads(payload.decode("utf-8"))))
            except ValueError:
                # Best effort: an unparsable GeoJSON part is dropped.
                pass
        elif content_type == "application/json":
            try:
                json_part = json.loads(payload.decode("utf-8"))
            except ValueError:
                json_part = None
        elif maintype == "text":
            payload = payload.decode(part.get_content_charset() or "utf-8")
            subtype = part.get_content_subtype()
            if subtype == "plain":
                # if multiple text/plain parts are given, concatenate
                text_part += payload
            elif subtype == "html":
                # if no text/plain version is given,
                # get plain text version from HTML using BeautifulSoup
                if not text_part:
                    soup = bs4.BeautifulSoup(payload, "html.parser")
                    for item in soup(["script", "style"]):
                        item.extract()
                    html_part += "\n" + "\n".join(
                        line.strip()
                        for line in soup.get_text().split("\n") if line)
            else:
                # other subtypes of text are considered attachments
                attachments.append((content_type, payload))
        elif maintype in ("image", "video", "application"):
            # attachments
            attachments.append((content_type, payload))

    # --------------------------------------------------------------- FIX TEXT
    text_part = (text_part or html_part).strip()
    # remove signature
    match = SIGNATURE_RE.search(text_part)
    if match:
        text_part = text_part[:match.start()].strip()

    # --------------------------------------------------------------- FIX JSON
    # if no `application/json` part was given, parse `text/plain` as YAML
    # if we manage to parse YAML, remove this part from `text/plain`
    # otherwise, leave the text part untouched
    if not json_part:
        try:
            # only parse the first YAML document provided.
            # this enables the user to provide YAML,
            # use the '---' or '...' YAML document separators
            # and provide plain text afterwards.
            # An empty stream contains no document at all; default to None
            # rather than letting StopIteration escape.
            json_part = next(yaml.safe_load_all(text_part), None)
            # Start events of every document after the first one.
            later_documents = [
                event for event in yaml.parse(text_part)
                if isinstance(event, yaml.events.DocumentStartEvent)
            ][1:]
            if later_documents:
                # Keep everything after the start of the second document.
                # The first remaining entry is that document; indexing [1]
                # here raised IndexError with exactly two documents.
                text_part = text_part[later_documents[0].end_mark.index + 1:]
            else:
                text_part = ""
        except yaml.YAMLError:
            json_part = None

    return codec.update_item(
        domain=domain,
        issuer=issuer,
        recipient=recipient,
        identifier=identifier,
        external_id=external_id,
        new=new,
        subject=subject,
        json_part=json_part,
        text_part=text_part,
        attachments=attachments,
    )