예제 #1
1
def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): The already parsed e-mail message object.
      include_raw_body (bool, optional): If True, includes the raw body in the resulting
                               dictionary and reports the URIs, e-mail addresses, domains
                               and IPs observed in the body in clear text. If False, only
                               the values returned by wrap_hash_sha256() are reported.
                               Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full attachment
                                                data in the resulting dictionary (the flag
                                                is handed to traverse_multipart()).
                                                Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters.
                              Keys read here: 'whiteip' (IPs to ignore), 'whitefor'
                              ("for" addresses to ignore) and 'byhostentry' (gateway
                              host names to look for in "by" routing entries).
                              The dict passed in is never modified.

    Returns:
      dict: A dictionary with the content of the EML parsed and broken down into
            key-value pairs: 'header', 'body' and, if any attachment was found,
            'attachment'.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header_structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # Work on a shallow copy: the defaults added below must not leak into the
    # dictionary owned by the caller.
    pconf = dict(pconf) if pconf else {}

    # If no whitelisting is required, fall back to empty lists
    pconf.setdefault('whiteip', [])
    pconf.setdefault('whitefor', [])

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problem... report it...
    if msg.defects:
        headers_struc['defect'] = [str(exception) for exception in msg.defects]

    # parse and decode from
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to empty as a workaround.
        #
        logger.exception('We hit bug 27257!')

        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        del msg['from']

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode to
    headers_struc['to'] = headeremail2list(msg, 'to')
    # parse and decode Cc
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')

    # parse and decode delivered-to
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode Date
    # If date field is present
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(msg.get('date'))
        except Exception:
            # Best effort: any failure of the date parser falls back to the epoch.
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent...
        headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter()  # type: collections.Counter  # Counts hits per watched "by" host

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = eml_parser.routing.parserouting(received_line_flat)

            # If required collect the IP of the gateway that have injected the mail.
            # Iterate all parsed item and find IP
            # It is parsed from the MOST recent to the OLDEST (from IN > Out)
            # We match external IP from the most "OUT" Found.
            # Warning .. It may be spoofed !!
            # It add a warning if multiple identical items are found.

            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry in by_item:
                            # Save the last Found.. ( most external )
                            headers_struc['received_src'] = parsed_routing.get('from')

                            # Increment watched by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[byhostentry] > 1:  # Twice found the header...
                                # The warning list holds plain strings, so the
                                # message is appended as a string as well.
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append('Duplicate SMTP by entrypoint')
                                else:
                                    parsed_routing['warning'] = ['Duplicate SMTP by entrypoint']

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received headers"
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                                   eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(
                        ip)  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    logger.debug('Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domain
            for m in eml_parser.regex.recv_dom_regex.findall(received_line_flat):
                try:
                    ipaddress.ip_address(m)  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    # IPs are collected by the IP crawler above, hence they are
                    # ignored here; only entries that are not a valid IP are kept.
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate for emails into one array | uniq
    # for rapid "find"
    headers_struc['received_foremail'] = []
    for _parsed_routing in headers_struc['received']:
        for itemfor in _parsed_routing.get('for', []):
            if itemfor not in pconf['whitefor']:
                headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(headers_struc['received_email']))
    headers_struc['received_domain'] = list(set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']

    if not headers_struc['received_foremail']:
        del headers_struc['received_foremail']
    else:
        headers_struc['received_foremail'] = list(set(headers_struc['received_foremail']))

    if not headers_struc['received_domain']:
        del headers_struc['received_domain']

    if not headers_struc['received_ip']:
        del headers_struc['received_ip']
    ####################

    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}
    # Exactly one body part means this is not treated as a multipart e-mail.
    multipart = len(raw_body) != 1
    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on 500K+ body we got time and memory issues...
        # if more than 4K.. lets cheat, we will cut around the thing we search "://, @, ."
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            # NOTE(review): findall() is assumed to yield the offsets at which the
            # needle occurs in body - confirm against its definition.
            # Every window start is clamped to 0: a negative start index would be
            # interpreted relative to the END of the string and silently skip
            # matches located near the beginning of the body.
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(body[max(0, scn_pt - 16):scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(body[max(0, scn_pt - 64):scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(body[max(0, scn_pt - 253):scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(body[max(0, scn_pt - 11):scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 Char + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(body[max(0, scn_pt - 4):scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri,email and observed domain or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))

            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))

            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))

            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))

        else:
            if list_observed_urls:
                bodie['uri_hash'] = [wrap_hash_sha256(uri.lower()) for uri in set(list_observed_urls)]
            if list_observed_email:
                # Email already lowered
                bodie['email_hash'] = [wrap_hash_sha256(emel) for emel in set(list_observed_email)]
            if list_observed_dom:
                bodie['domain_hash'] = [wrap_hash_sha256(dom.lower()) for dom in set(list_observed_dom)]
            if list_observed_ip:
                # IP (v6) already lowered
                bodie['ip_hash'] = [wrap_hash_sha256(fip) for fip in set(list_observed_ip)]

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We are using replace . to : for avoiding issue in mongo
            k = k.lower().replace('.', ':')
            # If not multipart, store only the content-xx related headers with the
            # part; all the other headers end up in the global header structure.
            if multipart or k.startswith('content'):
                ch.setdefault(k, []).append(v)
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one .. as does thunderbird
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    # Header names are lower-cased BEFORE de-duplication: get_all() matches names
    # case-insensitively, so visiting both 'Received' and 'received' would store
    # every value twice.
    for k in {key.lower() for key in msg.keys()}:
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # Fall back to the workaround helper for this field.
            logger.exception('We hit bug 27257!')

            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(msg, k)

        # Stored exactly once, whichever path produced the values.
        if decoded_values:
            header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    try:
        report_struc['attachment'] = traverse_multipart(msg, 0, include_attachment_data)
    except (binascii.Error, AssertionError):
        # we hit this exception if the payload contains invalid data
        logger.exception('Exception occured while parsing attachment data. Collected data will not be complete!')
        report_struc['attachment'] = None

    # Dirty hack... transform hash into list.. need to be done in the function.
    # Mandatory to search efficiently in mongodb
    # See Bug 11 of eml_parser
    if not report_struc['attachment']:
        del report_struc['attachment']
    else:
        report_struc['attachment'] = list(report_struc['attachment'].values())

    report_struc['body'] = list(bodys_struc.values())
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
예제 #2
0
def checkContainsArchive(msg: email.message.Message) -> bool:
    """Check whether the given envelope is an Azazo transfer file.

    The message qualifies when both the header named by
    Const.PROJECT_NAME_HEADER and the one named by
    Const.PROJECT_VERSION_HEADER are present with a non-empty value.

    Args:
        msg: The e-mail message whose headers are inspected.

    Returns:
        True if both headers are present and non-empty, False otherwise
        (including when the lookup itself fails; the traceback is printed).
    """
    try:
        return bool(msg.get(Const.PROJECT_NAME_HEADER) and
                    msg.get(Const.PROJECT_VERSION_HEADER))
    except Exception:
        # Best effort: report the problem and treat the message as "not ours".
        traceback.print_exc()
        return False
예제 #3
0
def extract_files_from_email(
        message: email.message.Message) -> ParseMessageResult:
    """
    Parses an email Message and returns a ParseMessageResult:
    1. Is there a subject and from in the email? If no, an AssertionError
       is raised.
    2. Does the subject contain the word "unsubscribe"? If yes, return a
       result with status UNSUBSCRIBE.
    3. Is the subject 8 characters long? If yes, return a result with
       status REGISTER.
    4. Otherwise:
        - Walk through the message and grab all parts whose Content-Type
          header contains "application/pdf;" or "application/epub".
        - Return a ParseMessageResult with those files: status SUCCESS if
          at least one file was found, FAILURE otherwise.

    Attachment payloads are decoded with base64.b64decode().
    """
    subject: str = message.get("Subject")
    sent_from: str = message.get("From")
    if not (subject and sent_from):
        # Raised explicitly instead of via `assert` so that the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError("message is missing a Subject or From header")

    if "unsubscribe" in subject.lower():
        return ParseMessageResult(
            sent_from=sent_from,
            subject=subject,
            status=MessageStatus.UNSUBSCRIBE,
            extracted_files=[],
        )
    # FIXME: need a more robust check here
    if len(subject) == 8:
        return ParseMessageResult(
            sent_from=sent_from,
            subject=subject,
            status=MessageStatus.REGISTER,
            extracted_files=[],
        )

    # (Content-Type marker, fallback filename), checked in this order.
    attachment_kinds = (
        ("application/pdf;", "Remailable_Attachment.pdf"),
        ("application/epub", "Remailable_Attachment.epub"),
    )

    # Now we're done parsing the subject, we should check if there are any attachments
    files: List[FileTuple] = []
    for part in message.walk():
        # Parts without a Content-Type header yield None; treat that as "no match"
        # instead of failing on the `in` test.
        content_type = part["Content-Type"] or ""
        for marker, fallback_name in attachment_kinds:
            if marker in content_type:
                filename = part.get_filename() or fallback_name
                filebytes = base64.b64decode(part.get_payload())
                assert isinstance(filename, str)
                files.append((filename, filebytes))
                break

    # Couldn't parse any files -> FAILURE with an empty list
    status = MessageStatus.SUCCESS if files else MessageStatus.FAILURE
    return ParseMessageResult(
        sent_from=sent_from,
        subject=subject,
        status=status,
        extracted_files=files,
    )
예제 #4
0
def get_raw_body_text(
    msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """This method recursively retrieves all e-mail body parts and returns them as a list.

    A non-multipart part is collected when it has no content-disposition header
    and a "text" main type, or when its filename ends in .html/.htm.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: Returns a list of tuples which are in the form of
              "(encoding, raw_body_string, message field headers)", where encoding
              is the lower-cased content-transfer-encoding header ('' if absent).
    """
    raw_body = [
    ]  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.'
        )
        filename = ''

    is_inline_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_inline_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()

    charset = msg.get_content_charset()
    if charset is None:
        raw_body_str = msg.get_payload(decode=True)
        raw_body_str = eml_parser.decode.decode_string(
            raw_body_str, None)
    else:
        try:
            raw_body_str = msg.get_payload(decode=True).decode(
                charset, 'ignore')
        except Exception:
            # E.g. an unknown charset name: fall back to plain ASCII.
            logger.debug(
                'An exception occured while decoding the payload!',
                exc_info=True)
            raw_body_str = msg.get_payload(decode=True).decode(
                'ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        raw_body.append((encoding, raw_body_str, msg.items()))
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            raw_body.append((encoding, raw_body_str, msg.items()))
        finally:
            # Always hand the message back with its original policy, even if
            # reading the headers fails again.
            msg.policy = former_policy

    return raw_body
예제 #5
0
def hash_headers(mail: email.message.Message, header_to_hash: str, bh: str) -> SHA256.SHA256Hash:
    """Build the header block selected by *header_to_hash* and return its SHA-256 hash object.

    For every header name listed (each name is used once, duplicates in the
    list are ignored) that is present in *mail*, a line
    "<lower-cased name>:<stripped value>\\r\\n" is emitted. A final
    "dkim-signature:" line is appended with the DKIM-Signature header
    flattened to a single line, and every "b=<value>" occurrence in the
    resulting text is reduced to "b=" before hashing.

    Args:
        mail: email.message.Message object.
        header_to_hash: Header names to hash, separated by colons, e.g.
            'from:from:reply-to:subject:subject:date:date:message-id:to:cc'.
        bh: Body hash of the e-mail body, base64 encoded. Currently not used
            by this function.

    Returns:
        The SHA256 hash object computed over the assembled header text.

    Raises:
        TypeError: If *mail* has no DKIM-Signature header (re.sub() receives None).
    """
    header_lines = []
    seen = set()

    # Iterate the split list as-is and track what was already emitted; removing
    # entries from a list while iterating over it skips the following element.
    for header in header_to_hash.split(":"):
        name = header.lower()
        if name in seen:
            continue  # strip duplicate header like the from
        seen.add(name)

        value = mail[header]
        if value:
            header_lines.append(name + ":" + value.strip() + "\r\n")

    headers = "".join(header_lines)

    dkim_header = mail.get("DKIM-Signature")
    dkim_header = re.sub(r'(\n|\r)', "", dkim_header)
    dkim_header = re.sub(r'\s+', " ", dkim_header)
    headers += "dkim-signature:{}\r\n".format(dkim_header)
    headers = re.sub(r'b=[\w0-9\s/+=]+', "b=", headers)  # replace b=... with b=

    # No comparison against a fixed digest here: the result depends on the
    # message, so a hard-coded expected value only holds for one specific mail.
    return SHA256.new(headers.encode())
예제 #6
0
def get_by_msg(msg: email.message.Message, attr: str, decode=False) -> Union[bytes, str]:
    """Get the header *attr* from *msg*.

    Args:
        msg: The message to read the header from.
        attr: Name of the header.
        decode: If True, RFC 2047 encoded words in the header value are
            decoded and the result is returned as one string.

    Returns:
        The header value as returned by ``msg.get(attr)`` (None when the
        header is absent), or the decoded string when *decode* is True.
    """
    value = msg.get(attr)
    if not decode or value is None:
        return value

    # decode_header() returns a LIST of (chunk, charset) pairs, one per
    # encoded or plain section of the header; each one is decoded on its own.
    pieces = []
    for chunk, charset in email.header.decode_header(value):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or 'ascii', 'replace')
            except LookupError:
                # Charset name unknown to Python: keep what can be read.
                chunk = chunk.decode('ascii', 'replace')
        pieces.append(chunk)
    return ''.join(pieces)
예제 #7
0
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """This method recursively retrieves all e-mail body parts and returns them as a list.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: Returns a list of tuples which are in the form of
              "(encoding, raw_body_string, message field headers)". Only parts
              without a content-disposition header and with main type "text",
              or parts whose filename ends in .html/.htm, are included.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
    else:
        # Treat text document attachments as belonging to the body of the mail.
        # Attachments with a file-extension of .htm/.html are implicitly treated
        # as text as well in order not to escape later checks (e.g. URL scan).

        try:
            filename = msg.get_filename('').lower()
        except (binascii.Error, AssertionError):
            logger.exception(
                'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
            filename = ''

        if ('content-disposition' not in msg and msg.get_content_maintype() == 'text') or (
                filename.endswith(('.html', '.htm'))):
            encoding = msg.get('content-transfer-encoding', '').lower()

            charset = msg.get_content_charset()
            if charset is None:
                raw_body_str = msg.get_payload(decode=True)
                raw_body_str = eml_parser.decode.decode_string(raw_body_str, None)
            else:
                try:
                    raw_body_str = msg.get_payload(decode=True).decode(charset, 'ignore')
                except Exception:
                    # Decoding with the declared charset failed; retry as ASCII.
                    logger.debug('An exception occured while decoding the payload!', exc_info=True)
                    raw_body_str = msg.get_payload(decode=True).decode('ascii', 'ignore')

            # In case we hit bug 27257, try to downgrade the used policy
            try:
                raw_body.append((encoding, raw_body_str, msg.items()))
            except AttributeError:
                former_policy = msg.policy
                msg.policy = email.policy.compat32
                try:
                    raw_body.append((encoding, raw_body_str, msg.items()))
                finally:
                    # Restore the original policy even when the retry fails too.
                    msg.policy = former_policy

    return raw_body
예제 #8
0
def extract_pdf(message: email.message.Message) -> Tuple[str, bytes]:
    """
    Get a PDF from the email.

    The first part whose Content-Type header contains "application/pdf;" is
    used; its payload is decoded with base64.b64decode().

    If no such part exists and the Subject is exactly 8 characters long, the
    subject is passed as a code to register_user() together with the From
    header, and True is returned instead of a tuple.

    TODO: This is the thing to change to accommodate more than one PDF per msg.

    Returns:
        (filename, filebytes) of the PDF, or True after a registration.

    Raises:
        ValueError: If the message holds no PDF and no 8-character subject.
    """
    filename = None
    filebytes = None
    for part in message.walk():
        # Parts without a Content-Type header yield None; treat them as
        # "not a PDF" so the fallback below stays reachable.
        if "application/pdf;" in (part["Content-Type"] or ""):
            filename = part.get_filename() or "Remailable_Attachment.pdf"
            filebytes = base64.b64decode(part.get_payload())
            break
    else:
        # Let's try getting the subjectline and body and see if there's a code
        # for us to gobble up in there :)
        code = message.get("Subject")
        if code and len(code) == 8:
            register_user(message.get("From"), code)
            return True
        else:
            raise ValueError("No PDF in this message.")

    return (filename, filebytes)
예제 #9
0
 def from_email(cls, m: email.message.Message):
     """Create a Message from the headers of the e-mail *m*.

     Missing headers default to an empty string. The "from" and "to" headers
     are each parsed with email.utils.parseaddr(), all "cc" headers are parsed
     with email.utils.getaddresses(), and the date comes from _parsedate(m).
     """
     # Collected in the same order in which the headers are handed over below.
     fields = dict(
         message_id=m.get("message-id", ""),
         in_reply_to=m.get("in-reply-to", ""),
         date=_parsedate(m),
         email_from=email.utils.parseaddr(m.get("from", "")),
         email_to=email.utils.parseaddr(m.get("to", "")),
         carbon_copy=email.utils.getaddresses(m.get_all("cc", [])),
         subject=m.get("subject", ""),
         reply_to=m.get("reply-to", ""),
     )
     return Message(**fields)
예제 #10
0
def extract_pdf(message: email.message.Message) -> Tuple[str, bytes]:
    """
    Get a PDF from the email.

    Handling order:
    1. A Subject containing "unsubscribe" removes the sender via
       delete_user() and returns (False, False).
    2. The first part whose Content-Type header contains "application/pdf;"
       is returned as (filename, filebytes); the payload is decoded with
       base64.b64decode().
    3. Without a PDF, an 8-character Subject is passed as a code to
       register_user(); otherwise an error e-mail is sent. Both cases
       return (False, False).

    TODO: This is the thing to change to accommodate more than one PDF per msg.
    """

    # Handle unsubscribes:
    # A message without a Subject header yields None; treat it as empty.
    subject = message.get("Subject") or ""
    if "unsubscribe" in subject.lower():
        plog(f"Permanently removing user {message.get('From')}.")
        delete_user(message.get("From"))
        return (False, False)

    filename = None
    filebytes = None
    for part in message.walk():
        # Parts without a Content-Type header yield None; they are not PDFs.
        if "application/pdf;" in (part["Content-Type"] or ""):
            filename = part.get_filename() or "Remailable_Attachment.pdf"
            filebytes = base64.b64decode(part.get_payload())
            break
    else:
        # Let's try getting the subjectline and body and see if there's a code
        # for us to gobble up in there :)
        code = message.get("Subject")
        if code and len(code) == 8:
            register_user(message.get("From"), code)
            plog(f"Registered a new user {message.get('From')}.")
            send_email_if_enabled(
                message.get("From"),
                subject="Your email address is now verified!",
                message=
                "Your verification succeeded, and you can now email documents to your reMarkable tablet. Try responding to this email with a PDF attachment!",
            )
            return (False, False)
        else:
            send_email_if_enabled(
                message.get("From"),
                subject="A problem with your document :(",
                message=
                "Unfortunately, a problem occurred while processing your email. Remailable only supports PDF attachments for now. If you're still encountering issues, please get in touch with Jordan at [email protected] or on Twitter at @j6m8.",
            )
            plog(
                f"ERROR: Encountered no PDF in message from {message.get('From')}"
            )
            return (False, False)

    return (filename, filebytes)
예제 #11
0
 def __init__(self, message: email.message.Message, uid=None):
     """Parse *message* and expose its headers, attachments and body as attributes.

     NOTE(review): *message* is handed to email.message_from_bytes(), so it
     looks like raw bytes rather than a Message object - confirm with callers.
     """
     self.msgraw = message
     parsed = email.message_from_bytes(message)
     self.uid = uid

     def split_addresses(field):
         # Drop line breaks, turn tabs into spaces, then split on ', '.
         flattened = parsed.get(field, '')
         for needle, substitute in (('\n', ''), ('\r', ''), ('\t', ' ')):
             flattened = flattened.replace(needle, substitute)
         return flattened.split(', ')

     self.recipients = split_addresses('to')
     self.sender = parsed.get('from', '')
     self.subject = self.soft_decode(parsed.get('subject', ''))
     self.date = parsed.get('date', '')
     self.cc = split_addresses('cc')
     self.bcc = split_addresses('bcc')
     self.server_uid = parsed.get('message-id', '').strip()
     self.attachments = self.resolve_attachments(parsed)
     decoded_body = self.decode_body(parsed)
     self.body = decoded_body or ''
     # A BeautifulSoup body marks the message as HTML, anything else as text.
     if isinstance(self.body, BeautifulSoup):
         self.type = 'html'
     else:
         self.type = 'text'
예제 #12
0
def _clamped_window(text: str, position: int, before: int,
                    after: int) -> str:
    """Return ``text[position - before:position + after]`` without wrap-around.

    The lower bound is clamped to 0: a negative start index would make Python
    count from the end of the string and produce the wrong (usually empty)
    window for hits located close to the beginning of the text.
    """
    return text[max(0, position - before):position + after]


def parse_email(msg: email.message.Message,
                include_raw_body: bool = False,
                include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None,
                parse_attachments: bool = True) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): The already parsed e-mail message. The
                               message may be modified: a broken "from" or
                               "date" header is replaced by a fallback value.
      include_raw_body (bool, optional): If True, each body entry carries its
                               content and the observed URIs, e-mail addresses,
                               domains and IPs in clear text. If False, only
                               hashes of those observables are reported.
                               Defaults to False.
      include_attachment_data (bool, optional): Passed on to
                               traverse_multipart() to request the full
                               attachment data. Defaults to False.
      pconf (dict, optional): A dict with various optional configuration
                               parameters. The keys read here are 'whiteip'
                               (IPs to ignore), 'whitefor' ("for" addresses to
                               ignore) and 'byhostentry' (host names used to
                               detect the receiving gateway). The dict passed
                               in is not modified.
      parse_attachments (bool, optional): Set this to false if you want to
                               disable the parsing of attachments.
                               Please note that HTML attachments as well as
                               other text data marked to be in-lined, will
                               always be parsed.

    Returns:
      dict: A dictionary with the keys 'header' (parsed headers), 'body'
            (list with one entry per body part) and, if attachments were
            parsed and found, 'attachment' (list of attachments).
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header_structure

    # Work on a shallow copy so that filling in the defaults below does not
    # leak into the configuration dict owned by the caller.
    pconf = dict(pconf or {})
    # If no whitelisting is required, fall back to empty lists
    pconf.setdefault('whiteip', [])
    pconf.setdefault('whitefor', [])

    # parse and decode subject
    headers_struc['subject'] = eml_parser.decode.decode_field(
        msg.get('subject', ''))

    # If parsing had problems, report it
    if msg.defects:
        headers_struc['defect'] = [str(defect) for defect in msg.defects]

    # parse and decode "from"
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The header is replaced by the workaround value, or by an empty
        # string if the workaround returns nothing.
        logger.exception('We hit bug 27257!')

        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        del msg['from']

        replacement = _from[0] if _from else ''
        msg.add_header('from', replacement)
        msg_header_field = replacement.lower()

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode "to"
    headers_struc['to'] = headeremail2list(msg, 'to')

    # parse and decode "cc"; only reported when not empty
    cc_list = headeremail2list(msg, 'cc')
    if cc_list:
        headers_struc['cc'] = cc_list

    # parse and decode delivered-to; only reported when not empty
    delivered_to = headeremail2list(msg, 'delivered-to')
    if delivered_to:
        headers_struc['delivered_to'] = delivered_to

    # parse and decode Date; the epoch is used if it is absent or unparsable
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(
                msg.get('date'))
        except Exception:
            # Deliberately broad: whatever goes wrong, fall back to the epoch.
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse(
                '1970-01-01T00:00:00+0000')
            # The fallback value also replaces the header in the message.
            msg.replace_header('date', headers_struc['date'])
    else:
        headers_struc['date'] = dateutil.parser.parse(
            '1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        # Counts how often each watched "by" host was seen (duplicate "HOP")
        found_smtpin = collections.Counter()  # type: collections.Counter

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            # Collapse every run of whitespace into a single space
            received_line_flat = re.sub(r'(\r|\n|\s|\t)+',
                                        ' ',
                                        line,
                                        flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = eml_parser.routing.parserouting(
                received_line_flat)

            # If required collect the IP of the gateway that has injected the
            # mail. The lines are parsed from the MOST recent to the OLDEST
            # (from IN > Out), so the last match is the most external one.
            # Warning: it may be spoofed!
            # A warning is added if the same entry point is found twice.
            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry not in by_item:
                            continue

                        # Save the last found (most external)
                        headers_struc['received_src'] = parsed_routing.get(
                            'from')

                        found_smtpin[byhostentry] += 1
                        if found_smtpin[byhostentry] > 1:
                            # Keep 'warning' a flat list of strings in both
                            # branches.
                            if parsed_routing.get('warning'):
                                parsed_routing['warning'].append(
                                    'Duplicate SMTP by entrypoint')
                            else:
                                parsed_routing['warning'] = [
                                    'Duplicate SMTP by entrypoint'
                                ]

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received headers"
            ips_in_received_line = (
                eml_parser.regex.ipv6_regex.findall(received_line_flat) +
                eml_parser.regex.ipv4_regex.findall(received_line_flat))
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(ip)  # type: ignore
                except ValueError:
                    logger.debug(
                        'Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private
                            or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domain
            for candidate in eml_parser.regex.recv_dom_regex.findall(
                    received_line_flat):
                try:
                    ipaddress.ip_address(candidate)  # type: ignore
                except ValueError:
                    # IPs are collected by the IP search above, so only
                    # entries which are not a valid IP are kept as domain.
                    headers_struc['received_domain'].append(candidate)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(
                    received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'].append(mail_candidate)

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate the "for" e-mails of all hops into one list for rapid "find"
    headers_struc['received_foremail'] = []
    for _parsed_routing in headers_struc['received']:
        for itemfor in _parsed_routing.get('for', []):
            if itemfor not in pconf['whitefor']:
                headers_struc['received_foremail'].append(itemfor)

    # Uniq the collected data and drop the entries which stayed empty
    for key in ('received_email', 'received_foremail', 'received_domain',
                'received_ip'):
        unique_values = list(set(headers_struc[key]))
        if unique_values:
            headers_struc[key] = unique_values
        else:
            del headers_struc[key]
    ####################

    # Parse text body
    raw_body = get_raw_body_text(msg)

    # Is it a multipart email ?
    multipart = len(raw_body) != 1

    bodies = []  # type: typing.List[typing.Dict[str, typing.Any]]

    for _, body, body_multhead in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on 500K+ body we got time and memory issues...
        # if more than 4K.. lets cheat, we will cut around the thing we search "://, @, ."
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(
                    _clamped_window(body, scn_pt, 16,
                                    4096)) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(
                        _clamped_window(body, scn_pt, 64, 255)):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(
                        _clamped_window(body, scn_pt, 253, 1004)):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses: up to 11 characters ("123.123.123")
                # before the last dot, then the dot itself plus up to 3
                # digits, hence an exclusive end of scn_pt + 4.
                for match in eml_parser.regex.ipv4_regex.findall(
                        _clamped_window(body, scn_pt, 11, 4)):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 Char + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(
                        _clamped_window(body, scn_pt, 4, 35)):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri, email, domain and ip in clear text if the raw body is
        # included, otherwise report their hashes only.
        # The flag tells whether the value still has to be lowered before
        # hashing (e-mails and IPs were lowered when they were collected).
        observables = (
            ('uri', list_observed_urls, True),
            ('email', list_observed_email, False),
            ('domain', list_observed_dom, True),
            ('ip', list_observed_ip, False),
        )
        for name, observed, lower_before_hash in observables:
            if not observed:
                continue
            unique_observed = set(observed)
            if include_raw_body:
                bodie[name] = list(unique_observed)
            else:
                bodie[name + '_hash'] = [
                    wrap_hash_sha256(
                        item.lower() if lower_before_hash else item)
                    for item in unique_observed
                ]

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # We are using replace . to : for avoiding issue in mongo
            k = k.lower().replace('.', ':')
            # If not multipart, store only the content-xx related headers
            # with the part, otherwise we would get all the mail headers.
            if multipart or k.startswith('content'):
                # make sure we are working with strings only
                ch.setdefault(k, []).append(str(v))
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one .. as does thunderbird
        val = ch.get('content-type')
        if val:
            bodie['content_type'] = val[-1].split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        bodies.append(bodie)

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    # Header names are lowered and de-duplicated before the lookup: get_all()
    # already returns the values of every spelling of a name, so visiting
    # "X-Foo" and "x-foo" separately would store each value twice.
    for k in dict.fromkeys(name.lower() for name in msg.keys()):
        try:
            decoded_values = [value for value in msg.get_all(k, []) if value]
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # Use the values returned by the workaround instead.
            logger.exception('We hit bug 27257!')

            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(
                msg, k)

        # Stored once, and only if there is at least one value.
        if decoded_values:
            header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    if parse_attachments:
        try:
            attachments = traverse_multipart(msg, 0, include_attachment_data)
        except (binascii.Error, AssertionError):
            # we hit this exception if the payload contains invalid data
            logger.exception(
                'Exception occured while parsing attachment data. Collected data will not be complete!'
            )
            attachments = None

        # Dirty hack... transform hash into list.. need to be done in the function.
        # Mandatory to search efficiently in mongodb
        # See Bug 11 of eml_parser
        if attachments:
            report_struc['attachment'] = [
                attachments[key] for key in attachments
            ]

    report_struc['body'] = bodies

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
예제 #13
0
def _parsedate(m: email.message.Message) -> datetime.datetime:
    """Return the message's "Date" header as a timezone-aware UTC datetime.

    Args:
      m: Message whose "date" header is parsed.

    Returns:
      The parsed date converted to UTC.

    Raises:
      TypeError or ValueError (depending on the Python version), raised by
      email.utils.parsedate_to_datetime when the header is missing or cannot
      be parsed.
    """
    parsed = email.utils.parsedate_to_datetime(m.get("date", ""))
    if parsed.tzinfo is None:
        # A "-0000" zone yields a naive datetime that is already expressed in
        # UTC. astimezone() would treat it as local time, so attach UTC first.
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed.astimezone(datetime.timezone.utc)