def setUp(self):
    """Prepare the parsed mail, its processed attachments and the settings.

    The sample mail is parsed once; every attachment is then run through a
    ``SampleParser`` configured with Tika enabled, and each result is
    collected in ``self.attachments``.
    """
    self.f = utils.reformat_output

    mail_parser = MailParser()
    mail_parser.parse_from_file(mail)

    self.mail_obj = mail_parser.parsed_mail_obj
    self.mail_obj['analisys_date'] = datetime.datetime.utcnow().isoformat()

    sample_parser = sp.SampleParser(
        tika_enabled=True,
        tika_jar="/opt/tika/tika-app-1.14.jar",
        tika_memory_allocation=None,
        tika_valid_content_types=['application/zip'],
    )

    self.attachments = []
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        self.attachments.append(sample_parser.result)

    # Index and type names handed to the function under test.
    self.parameters = {
        'elastic_index_mail': "spamscope_mails-",
        'elastic_type_mail': "spamscope",
        'elastic_index_attach': "spamscope_attachments-",
        'elastic_type_attach': "spamscope",
    }
def test_meta_data(self):
    """Tika meta data must appear only when valid content types are given."""
    mail_parser = MailParser()
    mail_parser.parse_from_file(mail)

    samples = []
    sample_parser = sp.SampleParser()
    tika = sp.TikaAnalysis(jar="/opt/tika/tika-app-1.13.jar")

    # Keep only the attachments that produced a non-empty result.
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        if sample_parser.result:
            samples.append(sample_parser.result)

    # First pass: analyser built without valid_content_types.
    for sample in samples:
        tika.add_meta_data(sample)
        self.assertNotIn('tika', sample)

    # Second pass: analyser that accepts zip archives.
    tika = sp.TikaAnalysis(
        jar="/opt/tika/tika-app-1.13.jar",
        valid_content_types=["application/zip"],
    )
    for sample in samples:
        tika.add_meta_data(sample)
        self.assertIn('tika', sample)
def test_add_analysis(self):
    """VirusTotal analysis must tag each sample and each of its files."""
    mail_parser = MailParser()
    mail_parser.parse_from_file(mail)

    samples = []
    sample_parser = sp.SampleParser()
    virustotal = sp.VirusTotalAnalysis(api_key=API_KEY)

    # Keep only the attachments that produced a non-empty result.
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        if sample_parser.result:
            samples.append(sample_parser.result)

    for sample in samples:
        virustotal.add_analysis(sample)
        self.assertIn('virustotal', sample)
        for inner_file in sample["files"]:
            self.assertIn('virustotal', inner_file)
def setUp(self):
    """Load the attachment lists of the two sample mails."""
    mail_parser = MailParser()

    mail_parser.parse_from_file(mail)
    self.attachments = mail_parser.attachments_list

    # The same parser instance is reused for the second sample mail.
    mail_parser.parse_from_file(mail_thug)
    self.attachments_thug = mail_parser.attachments_list
def main():
    """Parse a mail given on the command line and print the requested parts.

    The mail is read from ``args.file`` or, failing that, from
    ``args.string``. Each enabled option prints its part through
    ``safe_print``, always in the same fixed order.
    """
    args = get_args()
    parser = MailParser()

    if args.file:
        parser.parse_from_file(args.file)
    elif args.string:
        parser.parse_from_string(args.string)

    if args.json:
        parsed = json.loads(parser.parsed_mail_json)
        safe_print(json.dumps(parsed, ensure_ascii=False, indent=4))

    # Plain fields as (command line flag, parser attribute), in output order.
    plain_fields = (
        ("body", "body"),
        ("headers", "headers"),
        ("to", "to_"),
        ("from_", "from_"),
        ("subject", "subject"),
    )
    for flag, attribute in plain_fields:
        if getattr(args, flag):
            safe_print(getattr(parser, attribute))

    if args.defects:
        for category in parser.defects_category:
            safe_print(category)

    if args.anomalies:
        for anomaly in parser.anomalies:
            safe_print(anomaly)

    if args.senderip:
        address = parser.get_server_ipaddress(args.senderip)
        safe_print(address if address else "Not Found")

    if args.attachments:
        for attachment in parser.attachments_list:
            safe_print(json.dumps(attachment, ensure_ascii=False, indent=4))
def main():
    """Parse a mail given on the command line and print the requested parts.

    The mail is read from ``args.file`` or, failing that, from
    ``args.string``. Each enabled option prints its part of the parsed mail.

    Text is printed as text. Printing ``value.encode('utf-8')`` directly
    would show the ``b'...'`` representation on Python 3, so the UTF-8
    encoded form is used only as a fallback when the output stream cannot
    encode the text.
    """
    def emit(text):
        # Print text as is; fall back to UTF-8 bytes only when the
        # stream encoding rejects it.
        try:
            print(text)
        except UnicodeEncodeError:
            print(text.encode('utf-8'))

    args = get_args()
    parser = MailParser()

    if args.file:
        parser.parse_from_file(args.file)
    elif args.string:
        parser.parse_from_string(args.string)

    if args.json:
        j = json.loads(parser.parsed_mail_json)
        emit(json.dumps(j, ensure_ascii=False, indent=4))

    if args.body:
        emit(parser.body)

    if args.headers:
        emit(parser.headers)

    if args.to:
        emit(parser.to_)

    if args.from_:
        emit(parser.from_)

    if args.subject:
        emit(parser.subject)

    if args.defects:
        for i in parser.defects_category:
            emit(i)

    if args.anomalies:
        for i in parser.anomalies:
            emit(i)

    if args.senderip:
        r = parser.get_server_ipaddress(args.senderip)
        if r:
            emit(r)
        else:
            print("Not Found")

    if args.attachments:
        for i in parser.attachments_list:
            emit(json.dumps(i, ensure_ascii=False, indent=4))
def setUp(self):
    """Prepare the parsed mail, its attachments object and the settings."""
    self.f = utils.reformat_output

    mail_parser = MailParser()
    mail_parser.parse_from_file(mail)

    self.mail_obj = mail_parser.parsed_mail_obj
    self.mail_obj['analisys_date'] = datetime.datetime.utcnow().isoformat()

    # Build the attachments object from the parsed list, then run it.
    self.attachments = MailAttachments.withhashes(
        mail_parser.attachments_list)
    self.attachments.run()

    # Index and type names handed to the function under test.
    self.parameters = {
        'elastic_index_mail': "spamscope_mails-",
        'elastic_type_mail': "spamscope",
        'elastic_index_attach': "spamscope_attachments-",
        'elastic_type_attach': "spamscope",
    }
def setUp(self):
    """Create the VirusTotal processor and the parsed sample attachments."""
    mail_parser = MailParser()
    sample_parser = sp.SampleParser()
    self.virustotal = sp.VirusTotalProcessing(api_key=API_KEY)

    mail_parser.parse_from_file(mail)

    # One SampleParser result is stored per mail attachment.
    self.attachments = []
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        self.attachments.append(sample_parser.result)
def setUp(self):
    """Create the Thug processor and the parsed sample attachments."""
    mail_parser = MailParser()
    sample_parser = sp.SampleParser()
    self.thug = sp.ThugProcessing(
        referer="http://www.google.com/",
        extensions=[".js"],
        user_agents=["win7ie90", "winxpie80"],
    )

    mail_parser.parse_from_file(mail)

    # One SampleParser result is stored per mail attachment.
    self.attachments = []
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        self.attachments.append(sample_parser.result)
def setUp(self):
    """Create the Tika processor and the parsed sample attachments."""
    mail_parser = MailParser()
    sample_parser = sp.SampleParser()
    self.tika = sp.TikaProcessing(
        jar="/opt/tika/tika-app-1.14.jar",
        valid_content_types=['application/zip'],
        memory_allocation=None,
    )

    mail_parser.parse_from_file(mail)

    # One SampleParser result is stored per mail attachment.
    self.attachments = []
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        self.attachments.append(sample_parser.result)
def test_virustotal(self):
    """A VirusTotal-enabled SampleParser must tag results and inner files."""
    mail_parser = MailParser()
    mail_parser.parse_from_file(mail)

    sample_parser = sp.SampleParser(
        virustotal_enabled=True,
        virustotal_api_key=API_KEY,
    )

    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )

        # Check each result right after it is produced.
        self.assertIn('virustotal', sample_parser.result)
        for inner_file in sample_parser.result["files"]:
            self.assertIn('virustotal', inner_file)
def test_complete(self):
    """Run SampleParser with Thug, Tika and VirusTotal all enabled."""
    mail_parser = MailParser()
    sample_parser = sp.SampleParser(
        blacklist_content_types=[],
        thug_enabled=True,
        tika_enabled=True,
        virustotal_enabled=True,
        tika_jar="/opt/tika/tika-app-1.14.jar",
        tika_memory_allocation=None,
        tika_valid_content_types=['application/zip'],
        virustotal_api_key=API_KEY,
        thug_referer="http://www.google.com/",
        thug_extensions=[".js"],
        thug_user_agents=["winxpie80"],
    )

    mail_parser.parse_from_file(mail)

    results = []
    for attachment in mail_parser.attachments_list:
        sample_parser.parse_sample_from_base64(
            data=attachment['payload'],
            filename=attachment['filename'],
            mail_content_type=attachment['mail_content_type'],
            transfer_encoding=attachment['content_transfer_encoding'],
        )
        results.append(sample_parser.result)

    for sample in results:
        # Expected on the sample itself: tika and virustotal, no thug.
        self.assertIn("tika", sample)
        self.assertIn("virustotal", sample)
        self.assertNotIn("thug", sample)

        # Expected on its inner files: virustotal and thug, no tika.
        for inner_file in sample['files']:
            self.assertNotIn("tika", inner_file)
            self.assertIn("virustotal", inner_file)
            self.assertIn("thug", inner_file)
def textExtraction(f):
    """Return the body of the mail read from ``f``.

    Args:
        f: mail file reference handed to ``MailParser.parse_from_file``
            (presumably a path; confirm against callers).

    Returns:
        The ``body`` attribute of the parser after parsing ``f``.
    """
    parser = MailParser()
    # The return value of parse_from_file was never used, so it is not kept.
    parser.parse_from_file(f)
    return parser.body
# Build the ham lookup from 'Sheet1' of ham.xlsx: the value of each cell in
# the first column maps to the value of the cell beside it in the second.
wb = load_workbook(filename='ham.xlsx', read_only=True)
ws = wb['Sheet1']
allcolumns = ws.columns
hcol1v, hcol2p = allcolumns[0], allcolumns[1]

ham = {}
for i, cell in enumerate(hcol1v):
    ham[cell.value] = hcol2p[i].value

# Tokenise the body of one training mail.
parser = MailParser()
parser.parse_from_file("TRAINING/TRAIN_00549.eml")
body = parser.body
tokens = Tokenizer.freqdata(body)

# Running product over the tokens found in `spam` with a positive value
# (presumably the spam-side likelihood; confirm against the rest of the
# script).
PR_wi_S = 1
for keys in tokens:
    if keys not in spam.keys():
        continue
    weight = float(spam.get(keys))
    if weight > 0:
        PR_wi_S *= weight * tokens.get(keys)

PR_wi_H = 1
def parse_dir(dir):
    """Pre-process every mail found under ``dir`` into a file of words.

    Each file returned by ``getAllFiles(dir)`` is parsed as a mail. Its body
    is stripped of punctuation, stemmed and cleared of stop words, and the
    remaining words are written, space separated, to a numbered file under
    ``./output/`` followed by ``dir`` without its first character.

    Files that cannot be parsed are counted as failures, and the matching
    file under ``./PreProcessed`` is removed when present.

    Args:
        dir: source directory; its first character is dropped when the
            output path is built (presumably a leading "."; confirm against
            callers).
    """
    from time import sleep

    source_top_directory = dir
    files = getAllFiles(source_top_directory)
    dest_top_directory = "./output/"
    dest_directory = dest_top_directory + source_top_directory[1:]

    try:
        makedirs(dest_top_directory)
        print("Output top directory check: Created")
    except OSError:
        print("Output top directory check: Exists")

    try:
        makedirs(dest_directory)
        print("Output inner directory check: Created")
    except OSError:
        print("Output inner directory check: Exists")

    delay_time = 5
    print("Parsing will start in", delay_time, "seconds")
    sleep(delay_time)

    parser = MailParser()
    success = 0
    fail = 0

    for mail_path in files:
        print("file to be parsed:")
        print(mail_path)

        try:
            parser.parse_from_file(mail_path)
        except Exception:
            print("file cannot be parsed")
            output_file_path = "./PreProcessed" + mail_path[1:]
            try:
                remove(output_file_path)
            except OSError:
                print("already removed")
            fail += 1
            continue
        success += 1

        mail_body = removePunc(str(parser.body))

        try:
            mail_body_words = stemWords(mail_body)
        except Exception:
            # Without a fallback the name would be undefined on the first
            # mail, or would still hold the previous mail's words.
            # NOTE(review): assumes removePunc returns a str and stemWords
            # returns a list of words; confirm.
            print("stemming failed, using unstemmed words")
            mail_body_words = mail_body.split()

        mail_body_words = removeStopWords(mail_body_words)

        # Output files are numbered by the count of parsed mails so far.
        output_file_path = dest_directory + str(success)
        print("Parsing Complete")
        print(mail_path, "parsed to output file: ", output_file_path, "\n\n")

        with open(output_file_path, 'w') as of:
            for word in mail_body_words:
                try:
                    of.write(str(word) + " ")
                except Exception:
                    # Best effort: skip words that cannot be written.
                    continue

    print("Parsed: ", success)
    print("Failed: ", fail)
class TestMailParser(unittest.TestCase):
    """Tests of ``MailParser`` against the sample mail fixtures."""

    def setUp(self):
        # Each test gets its own parser instance.
        self.parser = MailParser()

    def test_ipaddress(self):
        """Sender IP is found for a trusted server, None for blank input."""
        self.parser.parse_from_file(mail_test_2)

        trust = "smtp.customers.net"
        ip = "217.76.210.112"
        result = self.parser.get_server_ipaddress(trust)
        self.assertEqual(result, ip)

        trust = ""
        result = self.parser.get_server_ipaddress(trust)
        self.assertIsNone(result)

        trust = "  "
        result = self.parser.get_server_ipaddress(trust)
        self.assertIsNone(result)

    def test_fingerprints_body(self):
        """Hashes of the UTF-8 encoded body match the known values."""
        self.parser.parse_from_file(mail_test_1)
        md5, sha1, sha256, sha512 = fingerprints(
            self.parser.body.encode("utf-8"))
        self.assertEqual(md5, "1bbdb7dcf511113bbc0c1b214aeac392")
        self.assertEqual(sha1, "ce9e62b50fa4e2168278880b14460b905b24eb4b")
        self.assertEqual(sha256, ("1e9b96e3f1bc74702f9703391e8ba0715b849"
                                  "7127a7ff857013ab33385898574"))
        self.assertEqual(sha512, ("ad858f7b5ec5549e55650fd13df7683e403489"
                                  "77522995851fb6b625ac54744cf3a4bf652784"
                                  "dba971ef99afeec4e6caf2fdd10be72eabb730"
                                  "c312ffbe1c4de3"))

    def test_malformed_mail(self):
        """Both expected defect categories are reported."""
        self.parser.parse_from_file(mail_malformed_3)
        defects_category = self.parser.defects_category
        self.assertIn("StartBoundaryNotFoundDefect", defects_category)
        self.assertIn("MultipartInvariantViolationDefect", defects_category)

    def test_type_error(self):
        """All five attachments expose a text filename."""
        self.parser.parse_from_file(mail_test_5)
        self.assertEqual(len(self.parser.attachments_list), 5)
        for i in self.parser.attachments_list:
            self.assertIsInstance(i["filename"], six.text_type)

    def test_valid_mail(self):
        """Parsing a string that is not a mail raises InvalidMail."""
        with self.assertRaises(InvalidMail):
            self.parser.parse_from_string("fake mail")

    def test_valid_date_mail(self):
        """The sample mail is flagged with the mail_without_date anomaly."""
        self.parser.parse_from_file(mail_test_1)
        self.assertIn("mail_without_date", self.parser.anomalies)

    def test_parsing_know_values(self):
        """Parsed fields of the sample mail match their known values."""
        self.parser.parse_from_file(mail_test_2)
        trust = "smtp.customers.net"

        self.assertEqual(False, self.parser.has_defects)

        raw = "217.76.210.112"
        result = self.parser.get_server_ipaddress(trust)
        self.assertEqual(raw, result)

        raw = "<*****@*****.**>"
        result = self.parser.message_id
        self.assertEqual(raw, result)

        raw = "mporcile@server_mail.it"
        result = self.parser.to_
        self.assertEqual(raw, result)

        raw = "<*****@*****.**>"
        result = self.parser.from_
        self.assertEqual(raw, result)

        raw = "Bollettino Meteorologico del 29/11/2015"
        result = self.parser.subject
        self.assertEqual(raw, result)

        result = self.parser.has_defects
        self.assertEqual(False, result)

        result = len(self.parser.attachments_list)
        self.assertEqual(3, result)

        # "Sun, 29 Nov 2015 09:45:18 +0100" expressed in UTC.
        raw_utc = datetime.datetime(2015, 11, 29, 8, 45, 18, 0).isoformat()
        result = self.parser.date_mail.isoformat()
        self.assertEqual(raw_utc, result)

    def test_types(self):
        """Public attributes of a clean mail have the expected types."""
        self.parser.parse_from_file(mail_test_2)
        trust = "smtp.customers.net"

        self.assertEqual(False, self.parser.has_defects)

        result = self.parser.parsed_mail_obj
        self.assertIsInstance(result, dict)
        self.assertNotIn("defects", result)
        self.assertNotIn("anomalies", result)
        self.assertIn("has_defects", result)
        self.assertIn("has_anomalies", result)

        result = self.parser.get_server_ipaddress(trust)
        self.assertIsInstance(result, six.text_type)

        result = self.parser.parsed_mail_json
        self.assertIsInstance(result, six.text_type)

        result = self.parser.headers
        self.assertIsInstance(result, six.text_type)

        result = self.parser.body
        self.assertIsInstance(result, six.text_type)

        result = self.parser.date_mail
        self.assertIsInstance(result, datetime.datetime)

        result = self.parser.from_
        self.assertIsInstance(result, six.text_type)

        result = self.parser.to_
        self.assertIsInstance(result, six.text_type)

        result = self.parser.subject
        self.assertIsInstance(result, six.text_type)

        result = self.parser.message_id
        self.assertIsInstance(result, six.text_type)

        result = self.parser.attachments_list
        self.assertIsInstance(result, list)

        result = self.parser.defects
        self.assertIsInstance(result, list)

        result = self.parser.anomalies
        self.assertIsInstance(result, list)

    def test_defects_anomalies(self):
        """Defects and anomalies are reported for the sample mails."""
        self.parser.parse_from_file(mail_malformed_1)

        self.assertEqual(True, self.parser.has_defects)
        self.assertEqual(1, len(self.parser.defects))
        self.assertEqual(1, len(self.parser.defects_category))
        self.assertIn("defects", self.parser.parsed_mail_obj)
        self.assertIn("StartBoundaryNotFoundDefect",
                      self.parser.defects_category)
        self.assertIsInstance(self.parser.parsed_mail_json, six.text_type)

        result = len(self.parser.attachments_list)
        self.assertEqual(1, result)

        self.parser.parse_from_file(mail_test_1)

        # The expected defects for this mail differ between Python 2 and 3.
        if six.PY2:
            self.assertEqual(False, self.parser.has_defects)
            self.assertNotIn("defects", self.parser.parsed_mail_obj)
        elif six.PY3:
            self.assertEqual(True, self.parser.has_defects)
            self.assertEqual(1, len(self.parser.defects))
            self.assertEqual(1, len(self.parser.defects_category))
            self.assertIn("defects", self.parser.parsed_mail_obj)
            self.assertIn(
                "CloseBoundaryNotFoundDefect", self.parser.defects_category)

        self.assertEqual(True, self.parser.has_anomalies)
        self.assertEqual(2, len(self.parser.anomalies))
        self.assertIn("anomalies", self.parser.parsed_mail_obj)
        self.assertIn("has_anomalies", self.parser.parsed_mail_obj)

    def test_defects_bug(self):
        """A malformed mail yields one defect and no attachments."""
        self.parser.parse_from_file(mail_malformed_2)

        self.assertEqual(True, self.parser.has_defects)
        self.assertEqual(1, len(self.parser.defects))
        self.assertEqual(1, len(self.parser.defects_category))
        self.assertIn("defects", self.parser.parsed_mail_obj)
        self.assertIn("StartBoundaryNotFoundDefect",
                      self.parser.defects_category)
        self.assertIsInstance(self.parser.parsed_mail_json, six.text_type)

        result = len(self.parser.attachments_list)
        self.assertEqual(0, result)

    def test_add_content_type(self):
        """The single attachment carries content type, payload and encoding."""
        self.parser.parse_from_file(mail_test_3)

        self.assertEqual(False, self.parser.has_defects)

        result = self.parser.parsed_mail_obj

        self.assertEqual(len(result["attachments"]), 1)
        self.assertIsInstance(
            result["attachments"][0]["mail_content_type"], six.text_type)
        self.assertIsInstance(
            result["attachments"][0]["payload"], six.text_type)
        self.assertEqual(
            result["attachments"][0]["content_transfer_encoding"],
            "quoted-printable")
# Walk the labels in order. The i-th label belongs to the training mail
# whose file name carries i zero-padded to three digits.
no_of_spam = 0
no_of_ham = 0
i = 0
for lab in label:
    a = str(i).zfill(3)
    f = "TRAINING/TRAIN_00{}.eml".format(a)
    print(f)
    parser.parse_from_file(f)
    print(lab)

    # Label '0' adds the body to `spam`, label '1' adds it to `ham`;
    # any other label is only counted in the index.
    if lab == '0':
        spam += parser.body
        no_of_spam += 1
    elif lab == '1':
        ham += parser.body
        no_of_ham += 1

    i += 1

ferqofspam = Tokenizer.freqdata(spam)
print(ferqofspam)
print(no_of_spam)