Пример #1
0
def extract_body(message: message.Message,
                 include_quotes: bool = False,
                 prefer_text: bool = True) -> str:
    """Return the body of ``message``, choosing between plaintext and HTML.

    Both renderings are obtained up front from ``extract_plaintext_body``
    and ``extract_html_body``; ``include_quotes`` is passed to each of them
    unchanged.

    Args:
        message: The email whose body is wanted.
        include_quotes: Forwarded verbatim to both extraction helpers.
        prefer_text: When true, a non-empty plaintext body wins and HTML is
            the fallback; when false the roles are swapped.

    Returns:
        The preferred rendering if it is non-empty, otherwise the other one.

    Raises:
        ZulipEmailForwardUserError: If both helpers returned ``None`` (the
            content types of all parts are logged first), or if neither
            returned a non-empty value.
    """
    text_body = extract_plaintext_body(message, include_quotes)
    html_body = extract_html_body(message, include_quotes)

    if text_body is None and html_body is None:
        content_types = [part.get_content_type() for part in message.walk()]
        logging.warning("Content types: %s" % (content_types,))
        raise ZulipEmailForwardUserError(
            "Unable to find plaintext or HTML message body")
    if not (text_body or html_body):
        raise ZulipEmailForwardUserError(
            "Email has no nonempty body sections; ignoring.")

    if prefer_text:
        preferred, fallback = text_body, html_body
    else:
        preferred, fallback = html_body, text_body

    if preferred:
        return preferred
    # Needed for mypy. The validation above guarantees at least one of the
    # two bodies is non-empty, so the fallback must be it.
    assert fallback
    return fallback
Пример #2
0
def extract_body(message: message.Message, include_quotations: bool=False) -> str:
    """Return the body of ``message``, preferring plaintext over HTML.

    ``talon`` is imported here and initialised once, guarded by the
    module-level ``talon_initialized`` flag.

    Args:
        message: The email whose body is wanted.
        include_quotations: When false, the chosen part is passed through
            talon's quotation extraction before being returned.

    Returns:
        The non-empty ``text/plain`` part if there is one; otherwise the
        non-empty ``text/html`` part run through ``convert_html_to_markdown``.

    Raises:
        ZulipEmailForwardUserError: If neither part is non-empty.
    """
    import talon
    global talon_initialized
    if not talon_initialized:
        talon.init()
        talon_initialized = True

    # A non-empty plaintext version always wins.
    text_part = get_message_part_by_type(message, "text/plain")
    if text_part:
        if not include_quotations:
            return talon.quotations.extract_from_plain(text_part)
        return text_part

    # Only HTML is left; convert it so it reads well as markdown.
    html_part = get_message_part_by_type(message, "text/html")
    if html_part:
        if not include_quotations:
            html_part = talon.quotations.extract_from_html(html_part)
        return convert_html_to_markdown(html_part)

    # Nothing usable. Distinguish "no such part at all" from "empty part".
    if text_part is None and html_part is None:
        content_types = [part.get_content_type() for part in message.walk()]
        logging.warning("Content types: %s" % (content_types,))
        raise ZulipEmailForwardUserError("Unable to find plaintext or HTML message body")

    raise ZulipEmailForwardUserError("Email has no nonempty body sections; ignoring.")
Пример #3
0
def extract_and_upload_attachments(message: message.Message,
                                   realm: Realm) -> str:
    """Upload every named part of ``message`` and return links to the uploads.

    A part counts as an attachment when it has a filename that is still
    non-empty after ``handle_header_content``. Each attachment whose decoded
    payload is ``bytes`` is passed to ``upload_message_file`` on behalf of
    the ``settings.EMAIL_GATEWAY_BOT`` system bot, targeting ``realm``.
    Parts whose payload is not ``bytes`` are logged with a warning and
    skipped.

    Returns:
        One ``[filename](url)`` link per uploaded attachment, joined with
        newlines (an empty string when nothing was uploaded).
    """
    uploader = get_system_bot(settings.EMAIL_GATEWAY_BOT)

    links = []
    for part in message.walk():
        mime_type = part.get_content_type()
        raw_name = part.get_filename()
        if not raw_name:
            continue

        filename = handle_header_content(raw_name)
        if not filename:
            continue

        payload = part.get_payload(decode=True)
        if not isinstance(payload, bytes):
            logger.warning(
                "Payload is not bytes (invalid attachment %s in message from %s)."
                % (filename, message.get("From")))
            continue

        s3_url = upload_message_file(filename,
                                     len(payload),
                                     mime_type,
                                     payload,
                                     uploader,
                                     target_realm=realm)
        links.append("[%s](%s)" % (filename, s3_url))

    return '\n'.join(links)
Пример #4
0
def obtainImageInfosFromMessage(message):
    """Return ``(src, alt)`` pairs for the ``.jpg`` images of the HTML part.

    The last ``text/html`` part found while walking ``message`` is used.
    Its string form has all CR/LF characters removed before being parsed
    with BeautifulSoup.

    Args:
        message: An email message object offering ``walk()``.

    Returns:
        ``None`` when the message has no ``text/html`` part; otherwise a
        list of ``(src, alt)`` tuples, one for every ``<img>`` whose ``src``
        contains ``".jpg"``. ``alt`` is ``None`` when the tag has no such
        attribute.
    """
    # Get HTML part of message (the last one wins if there are several).
    html_part = None
    for part in message.walk():
        if part.get_content_type() == "text/html":
            html_part = part

    if html_part is None:
        return None

    # NOTE(review): as_string() serialises the part including its headers
    # and without undoing any transfer encoding -- confirm this is intended
    # rather than get_payload(decode=True).
    html_str = html_part.as_string().replace("\r", "").replace("\n", "")

    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    soup = bs4.BeautifulSoup(html_str)

    image_info = []
    for img in soup.find_all("img"):
        src = img.get("src")
        # An <img> without a src attribute yields None; skip it instead of
        # failing on None.find(...).
        if src and ".jpg" in src:
            image_info.append((src, img.get("alt")))
    return image_info
Пример #5
0
def extract_body(message: message.Message, remove_quotations: bool=True) -> str:
    """Return the body of ``message``, preferring plaintext over HTML.

    ``talon`` is imported here and initialised once, guarded by the
    module-level ``talon_initialized`` flag.

    Args:
        message: The email whose body is wanted.
        remove_quotations: When true, the chosen part is passed through
            talon's quotation extraction before being returned.

    Returns:
        The non-empty ``text/plain`` part if there is one; otherwise the
        non-empty ``text/html`` part run through ``convert_html_to_markdown``.

    Raises:
        ZulipEmailForwardUserError: If neither part is non-empty.
    """
    import talon
    global talon_initialized
    if not talon_initialized:
        talon.init()
        talon_initialized = True

    # Plaintext is used whenever the message carries a non-empty one.
    text_part = get_message_part_by_type(message, "text/plain")
    if text_part:
        return (talon.quotations.extract_from_plain(text_part)
                if remove_quotations else text_part)

    # Otherwise fall back to HTML, converted so that it looks nice.
    html_part = get_message_part_by_type(message, "text/html")
    if html_part:
        cleaned_html = (talon.quotations.extract_from_html(html_part)
                        if remove_quotations else html_part)
        return convert_html_to_markdown(cleaned_html)

    # Neither part was usable: either both are missing or one is empty.
    nothing_found = text_part is None and html_part is None
    if not nothing_found:
        raise ZulipEmailForwardUserError("Email has no nonempty body sections; ignoring.")

    content_types = [part.get_content_type() for part in message.walk()]
    logging.warning("Content types: %s" % (content_types,))
    raise ZulipEmailForwardUserError("Unable to find plaintext or HTML message body")
Пример #6
0
    def _get_ics_part(self, message):
        ics_part = None
        for part in message.walk():
            if part.get_content_type() == 'text/calendar':
                ics_part = part

        return ics_part
    def _get_ics_part(self, message):
        ics_part = None
        for part in message.walk():
            if part.get_content_type() == 'text/calendar':
                ics_part = part

        return ics_part
Пример #8
0
def get_message_part_by_type(message: message.Message, content_type: str) -> Optional[str]:
    """Return the decoded text of the first part with the given content type.

    Args:
        message: The email to search; its parts are visited in ``walk()``
            order.
        content_type: The content type to look for, e.g. ``"text/plain"``.

    Returns:
        The payload of the first matching part, decoded with that part's
        declared charset (undecodable bytes are dropped), or ``None`` when
        no part has the requested content type.
    """
    # get_charsets() yields one entry per walked part, so the indices of
    # the two sequences line up.
    charsets = message.get_charsets()

    for idx, part in enumerate(message.walk()):
        if part.get_content_type() != content_type:
            continue
        content = part.get_payload(decode=True)
        assert isinstance(content, bytes)
        # A part without a declared charset used to be skipped silently.
        # If no charset has been specified in the header, assume us-ascii,
        # by RFC6657: https://tools.ietf.org/html/rfc6657
        charset = charsets[idx] or "us-ascii"
        return content.decode(charset, errors="ignore")
    return None
Пример #9
0
def get_message_part_by_type(message: message.Message, content_type: str) -> Optional[str]:
    """Find the first part of ``message`` with ``content_type`` and decode it.

    Args:
        message: The email whose parts are searched in ``walk()`` order.
        content_type: The wanted content type, e.g. ``"text/html"``.

    Returns:
        The decoded payload of the first matching part, or ``None`` if the
        message has no part of that type. Bytes that cannot be decoded are
        ignored.
    """
    charsets = message.get_charsets()

    for idx, part in enumerate(message.walk()):
        if part.get_content_type() == content_type:
            content = part.get_payload(decode=True)
            assert isinstance(content, bytes)
            if charsets[idx]:
                return content.decode(charsets[idx], errors="ignore")
            # Previously a part with no declared charset was passed over and
            # the function could return None although the part existed.
            # If no charset has been specified in the header, assume
            # us-ascii, by RFC6657: https://tools.ietf.org/html/rfc6657
            return content.decode("us-ascii", errors="ignore")

    return None
Пример #10
0
def get_message_part_by_type(message, content_type):
    # type: (message.Message, text_type) -> text_type
    """Return the decoded text of the first part with the given content type.

    Parts are visited in ``walk()`` order. The payload of the first match is
    decoded with the part's declared charset, ignoring undecodable bytes.
    ``None`` is returned when no part has the requested content type.
    """
    # get_charsets() has one entry per walked part, so indices line up.
    charsets = message.get_charsets()

    for idx, part in enumerate(message.walk()):
        if part.get_content_type() == content_type:
            content = part.get_payload(decode=True)
            # Without a declared charset the raw bytes used to be returned,
            # contradicting the declared text return type. Assume us-ascii
            # instead, by RFC6657: https://tools.ietf.org/html/rfc6657
            charset = charsets[idx] or "us-ascii"
            return content.decode(charset, errors="ignore")
    return None
Пример #11
0
def get_message_part_by_type(message, content_type):
    # type: (message.Message, text_type) -> text_type
    """Return the decoded text of the first part with the given content type.

    Parts are visited in ``walk()`` order. The payload of the first match is
    decoded with the part's declared charset, ignoring undecodable bytes.
    ``None`` is returned when no part has the requested content type.
    """
    # get_charsets() has one entry per walked part, so indices line up.
    charsets = message.get_charsets()

    for idx, part in enumerate(message.walk()):
        if part.get_content_type() == content_type:
            content = part.get_payload(decode=True)
            # ``bytes`` is the builtin that the ``binary_type`` alias named.
            assert isinstance(content, bytes)
            # ``text`` used to be left unbound when the part declared no
            # charset, so the return raised UnboundLocalError. Assume
            # us-ascii in that case, by RFC6657:
            # https://tools.ietf.org/html/rfc6657
            charset = charsets[idx] or "us-ascii"
            return content.decode(charset, errors="ignore")
    return None
Пример #12
0
def get_message_part_by_type(message, content_type):
    # type: (message.Message, Text) -> Text
    """Find the first part of ``message`` with ``content_type`` and decode it.

    The payload is decoded with the charset declared for that part, with
    undecodable bytes ignored. Returns ``None`` if no part matches.
    """
    charsets = message.get_charsets()

    for idx, part in enumerate(message.walk()):
        if part.get_content_type() != content_type:
            continue
        content = part.get_payload(decode=True)
        # ``bytes`` is the builtin that the ``binary_type`` alias named.
        assert isinstance(content, bytes)
        if charsets[idx]:
            return content.decode(charsets[idx], errors="ignore")
        # No charset declared: the old code returned an unassigned ``text``
        # here (UnboundLocalError). Assume us-ascii instead, by RFC6657:
        # https://tools.ietf.org/html/rfc6657
        return content.decode("us-ascii", errors="ignore")
    return None
Пример #13
0
def get_message_part_by_type(message: message.Message, content_type: str) -> Optional[str]:
    """Return the decoded text of the first part with the given content type.

    Args:
        message: The email to search; its parts are visited in ``walk()``
            order.
        content_type: The content type to look for, e.g. ``"text/plain"``.

    Returns:
        The payload of the first matching part, decoded with that part's
        declared charset or, failing that, as us-ascii. Undecodable bytes
        are dropped. ``None`` when no part has the requested content type.
    """
    part_charsets = message.get_charsets()

    for position, part in enumerate(message.walk()):
        if part.get_content_type() != content_type:
            continue

        payload = part.get_payload(decode=True)
        assert isinstance(payload, bytes)
        # If no charset has been specified in the header, assume us-ascii,
        # by RFC6657: https://tools.ietf.org/html/rfc6657
        encoding = part_charsets[position] or "us-ascii"
        return payload.decode(encoding, errors="ignore")

    return None
Пример #14
0
 def test_attachment_image_is_stored(self):
     """A PNG attachment of an injected mail is stored with the contribution.

     The attachment's Content-Disposition is rewritten to carry the filename
     ``foo.baz.png`` before the mail is injected; the stored file is then
     checked for size, image detection and the shape of its file name.
     """
     image_bytes = b'image-data'
     mail = self.get_message_html_plain('Text Alternative2', 'hypertext2', 'plaintext2')
     mail.add_attachment(image_bytes, maintype='image', subtype='png')
     png_parts = (part for part in mail.walk()
                  if part.get_content_type() == 'image/png')
     for png_part in png_parts:
         png_part.replace_header('Content-Disposition', 'attachment; filename="foo.baz.png"')
     self.inject_mail(self.gestalt.user.email, [self.group_address], data=mail.as_bytes())

     contribution = self.assertExists(models.Contribution,
                                      conversation__subject='Text Alternative2',
                                      text__text='hypertext2')
     self.assertEqual(contribution.attachments.count(), 1)
     stored = contribution.attachments.first().file.first()
     self.assertEqual(stored.file.size, len(image_bytes))
     # the check is only based on the content disposition (filename)
     self.assertTrue(stored.is_image())
     # The stored name must keep the 'foo.baz-' stem and the '.png' suffix.
     basename = os.path.basename(stored.file.path)
     self.assertTrue(basename.startswith('foo.baz-'), stored.file.path)
     self.assertTrue(basename.endswith('.png'), stored.file.path)
Пример #15
0
def parseFile(message):
    """
    Collect the attachments of a parsed email.

    Every part of ``message`` that has a filename is treated as an
    attachment. RFC 2047 encoded words in the filename are decoded, and a
    name whose last two dot-separated components are identical (for example
    ``a.txt.txt``) has the trailing duplicate removed.

    :param message: parsed email message
    :return parsefile: attachment contents as a dict with two parallel
        lists: ``'filename'`` (decoded names) and ``'filedata'`` (the
        payloads as returned by ``get_payload(decode=True)``)
    """
    from email.errors import HeaderParseError

    parsefile = {'filename': [], 'filedata': []}
    for msg in message.walk():
        # Get the attachment name; parts without one are not attachments.
        raw_name = msg.get_filename()
        if not raw_name:
            continue

        # Decode the attachment name. decode_header/make_header handle both
        # base64 and quoted-printable encoded words, and several of them.
        try:
            decoded_chunks = email.header.decode_header(raw_name)
            filename = str(email.header.make_header(decoded_chunks))
        except (HeaderParseError, LookupError, UnicodeError):
            # Malformed encoded word or unknown charset: keep the raw name
            # rather than dropping the attachment.
            filename = raw_name

        # Get the attachment data.
        filedata = msg.get_payload(decode=True)

        # Drop a duplicated trailing component. The length check keeps
        # names without any dot from raising IndexError.
        fns = filename.split('.')
        if len(fns) >= 2 and fns[-1] == fns[-2]:
            filename = '.'.join(fns[:-1])

        # Add the attachment name and data to the result dict.
        parsefile['filename'].append(filename)
        parsefile['filedata'].append(filedata)
    return parsefile
Пример #16
0
def parse(data, codec=Codec):  # noqa: C901
    """Parse a raw email and pass the extracted fields to ``codec.update_item``.

    Headers supply the issuer (``From``), the recipient/identifier/domain
    triple (``To``, through ``get_address_parts``) and the subject, from
    which the codec's "new" marker, external id and, if still missing, the
    identifier are consumed. Body parts are sorted into a JSON part, a text
    part (falling back to text extracted from HTML) and a list of
    ``(content_type, payload)`` attachments. A signature matched by
    ``SIGNATURE_RE`` is cut off the text. When no JSON part was found, the
    first YAML document of the text is used in its place.

    Args:
        data: The raw message as bytes.
        codec: Object providing ``re_new()``, ``re_external_id()``,
            ``re_id()`` and ``update_item(...)``.

    Returns:
        Whatever ``codec.update_item`` returns.
    """
    message = email.message_from_bytes(data)
    # ################################################################## HEADERS
    _, issuer = email.utils.parseaddr(message.get("from"))
    subject = utils_parse.normalize(message.get("subject"))
    recipient, identifier, domain = get_address_parts(message.get("to"))
    new_tag, subject = utils_parse.consume_re(codec.re_new(), subject)
    new = bool(new_tag)
    external_id, subject = utils_parse.consume_re(codec.re_external_id(),
                                                  subject)
    if not identifier and "in-reply-to" in message:
        irt = get_address_parts(message.get("in-reply-to"))
        if irt[0] == recipient and irt[2] == domain:
            if irt[1] == "new":
                # Set ``new`` itself: it was already derived from new_tag
                # above, so assigning new_tag here had no effect.
                new = True
            else:
                # Only a real identifier is taken over. An unconditional
                # assignment used to follow and turned "new" into the
                # identifier as well.
                identifier = irt[1]
    if not identifier:
        identifier, subject = utils_parse.consume_re(codec.re_id(), subject)

    # ##################################################################### BODY
    json_part, text_part, html_part, attachments = "", "", "", []
    for part in message.walk():
        # sub-parts are iterated over in this walk
        if part.is_multipart():
            continue
        payload = part.get_payload(decode=True)
        # ``geojson`` is falsy when that optional support is unavailable.
        if geojson and part.get_content_type() == "application/geo+json":
            try:
                attachments.append((part.get_content_type(),
                                    geojson.loads(payload.decode("utf-8"))))
            except ValueError:
                pass
        elif part.get_content_type() == "application/json":
            try:
                json_part = json.loads(payload.decode("utf-8"))
            except ValueError:
                json_part = None
        elif part.get_content_maintype() == "text":
            payload = payload.decode(part.get_content_charset() or "utf-8")
            # if multiple text/plain parts are given, concatenate
            if part.get_content_subtype() == "plain":
                text_part += payload
            # if no text/plain version is given,
            # get plain text version from HTML using BeautifulSoup
            if part.get_content_subtype() == "html" and not text_part:
                soup = bs4.BeautifulSoup(payload, "html.parser")
                for item in soup(["script", "style"]):
                    item.extract()
                html_part += "\n" + "\n".join(
                    line.strip() for line in soup.get_text().split("\n")
                    if line)
            # other subtypes of txt are considered attachments
            if part.get_content_subtype() not in ("plain", "html"):
                attachments.append((part.get_content_type(), payload))
        # attachments
        elif part.get_content_maintype() in ("image", "video", "application"):
            attachments.append((part.get_content_type(), payload))

    # ################################################################# FIX TEXT
    text_part = (text_part or html_part).strip()
    # remove signature
    match = SIGNATURE_RE.search(text_part)
    if match:
        text_part = text_part[:match.start()].strip()

    # ################################################################# FIX JSON
    # if no `application/json` part was given, parse `text/plain` as YAML
    # if we manage to parse YAML, remove this part from `text/plain`
    # otherwise, leave the text part untouched
    if not json_part:
        try:
            # only parse the first YAML document provided.
            # this enables the user to provide YAML,
            # use the '---' or '...' YAML document separators
            # and provided plain text afterwards.
            # The ``None`` default keeps an empty text from raising
            # StopIteration, which the except clause would not catch.
            json_part = next(yaml.safe_load_all(text_part), None)
            # Document starts after the first one, i.e. where the
            # remaining plain text begins.
            document_starts = [
                e for e in yaml.parse(text_part)
                if isinstance(e, yaml.events.DocumentStartEvent)
            ][1:]
            if document_starts:
                # Index 0 is the start of the second document. Index 1
                # raised IndexError when there were exactly two documents.
                text_part = text_part[document_starts[0].end_mark.index + 1:]
            else:
                text_part = ""
        except yaml.YAMLError:
            json_part = None

    return codec.update_item(
        domain=domain,
        issuer=issuer,
        recipient=recipient,
        identifier=identifier,
        external_id=external_id,
        new=new,
        subject=subject,
        json_part=json_part,
        text_part=text_part,
        attachments=attachments,
    )