def parse_body(body):
    """Parse a raw email and extract the fields of interest.

    Args:
        body: The complete raw message as bytes.

    Returns:
        A dict with the keys ``'from'``, ``'to'``, ``'subject'`` and
        ``'date'`` holding the corresponding header values (``None`` when a
        header is absent), and ``'text'`` holding the content of the
        ``text/plain`` body part. ``'text'`` is always a string: it is ``''``
        when the message has no plain text part or the part cannot be
        decoded.
    """
    msg = BytesParser(policy=policy.SMTP).parsebytes(body)

    print("This is the message: ", msg.keys())
    print("From : ", msg['From'])
    print("Date: ", msg['Date'])
    print("To: ", msg['To'])
    print("Subject : ", msg['Subject'])

    text = None
    try:
        # preferencelist has to be a real sequence: a bare 'plain' string is
        # matched by substring and would also accept subtypes such as "a".
        part = msg.get_body(preferencelist=('plain',))
        if part is not None:
            text = part.get_content()
    except Exception:
        # Deliberately best-effort: a malformed or undecodable part (for
        # example an unknown charset) must not abort parsing of the headers.
        text = None

    if text is None:
        print('Incoming message does not have an plain text part - skipping this part.')
        text = ''

    return {
        'from': msg['From'],
        'to': msg['To'],
        'subject': msg['Subject'],
        'date': msg['Date'],
        'text': text,
    }
def process_probe(row):
    """Extract features from one raw HTTP probe response.

    Args:
        row: Mapping with ``"data"`` (the raw response bytes) and ``"type"``
            (the probe name, used as the prefix of every output key).

    Returns:
        ``{}`` when the data does not start with ``HTTP/`` or cannot be split
        into a status line, a header block and a body. Otherwise a dict with
        these keys, each prefixed by ``"<type>:"``:

        * ``status_code`` -- the status code as ``float``, ``-1`` when it is
          not numeric, ``None`` when the status line has fewer than three
          space-separated fields.
        * ``status_text`` -- the reason phrase as bytes, or ``None`` in the
          same short-status-line case.
        * ``header_keys`` -- the header names in order of appearance.
        * ``header:<name>`` -- the value of each header.
        * ``dom_tree`` -- the result of ``tag_recursive`` on the parsed body,
          or ``""`` when the body cannot be parsed as HTML.
    """
    raw = row["data"]
    if not raw.startswith(b"HTTP/"):
        return {}  # TODO: do some kind of content analysis

    # Only the first blank line is normalised, so the head/body boundary can
    # be found with a single split while the header lines keep their CRLFs.
    response = raw.replace(b"\r\n\r\n", b"\n\n", 1)
    try:
        raw_headers, content = response.split(b"\n\n", 1)
        request_line, headers_alone = raw_headers.split(b"\r\n", 1)
    except ValueError:
        return {}

    # A status line with fewer than three fields leaves both values unset.
    status_code = status_text = None
    status_parts = request_line.split(b" ", 2)
    if len(status_parts) == 3:
        _, status_code, status_text = status_parts

    headers = BytesParser().parsebytes(headers_alone)

    # str(): the default (compat32) parser returns a Header object instead of
    # a str when the raw value contains non-ASCII bytes.
    transfer_encoding = str(headers.get("Transfer-Encoding", ""))
    # Transfer-coding names are case-insensitive, so compare in lower case.
    codings = {coding.strip().lower() for coding in transfer_encoding.split(",")}
    if "chunked" in codings:
        # the content is chunked and needs to be merged
        content = merge_chunks(content)

    tag_tree = ""
    try:
        tag_tree = tag_recursive(html.fromstring(content))
    except ParserError:
        # Bodies that are not parseable as HTML keep the empty tag tree.
        pass

    try:
        # TODO: IIS may return decimals in status_code, hence float().
        code = float(status_code)
    except ValueError:
        code = -1
    except TypeError:
        # status_code is None: the status line could not be parsed.
        code = None

    prefix = "{}:".format(row["type"])
    data = {
        prefix + "status_code": code,
        prefix + "status_text": status_text,
        prefix + "header_keys": headers.keys(),
    }
    for header in headers:
        data["{}header:{}".format(prefix, header)] = headers[header]
    data[prefix + "dom_tree"] = tag_tree

    return data