예제 #1
0
    def setUp(self):
        """Prepare the parsed mail, its analysed attachments and the parameters.

        Stores on the test case: ``f`` (the function under test), ``mail_obj``
        (parsed mail with an added ``analisys_date``), ``attachments`` (one
        sample result per mail attachment) and ``parameters``.
        """
        self.f = utils.reformat_output

        parser = MailParser()
        parser.parse_from_file(mail)
        self.mail_obj = parser.parsed_mail_obj
        self.mail_obj['analisys_date'] = datetime.datetime.utcnow().isoformat()

        # Sample parser with Tika enabled; 'application/zip' is the only
        # content type listed as valid for Tika.
        sampler = sp.SampleParser(
            tika_enabled=True,
            tika_jar="/opt/tika/tika-app-1.14.jar",
            tika_memory_allocation=None,
            tika_valid_content_types=['application/zip'])

        self.attachments = []

        for attachment in parser.attachments_list:
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])
            self.attachments.append(sampler.result)

        self.parameters = dict(
            elastic_index_mail="spamscope_mails-",
            elastic_type_mail="spamscope",
            elastic_index_attach="spamscope_attachments-",
            elastic_type_attach="spamscope")
예제 #2
0
    def test_meta_data(self):
        """Check when ``add_meta_data`` adds the 'tika' key to a sample.

        With a default ``TikaAnalysis`` no 'tika' key is expected; once
        'application/zip' is passed as a valid content type the key must
        be present.
        """
        tika_jar = "/opt/tika/tika-app-1.13.jar"

        # Parse the mail fixture
        parser = MailParser()
        parser.parse_from_file(mail)

        # Objects under test
        samples = []
        sampler = sp.SampleParser()
        tika = sp.TikaAnalysis(jar=tika_jar)

        # Collect the non-empty sample results
        for attachment in parser.attachments_list:
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])

            if sampler.result:
                samples.append(sampler.result)

        # First pass: analysis without valid content types
        for sample in samples:
            tika.add_meta_data(sample)
            self.assertNotIn('tika', sample)

        # Second pass: analysis restricted to zip archives
        tika = sp.TikaAnalysis(jar=tika_jar,
                               valid_content_types=["application/zip"])

        for sample in samples:
            tika.add_meta_data(sample)
            self.assertIn('tika', sample)
예제 #3
0
    def test_add_analysis(self):
        """Check that ``add_analysis`` adds 'virustotal' to a sample and its files."""

        # Parse the mail fixture
        parser = MailParser()
        parser.parse_from_file(mail)

        # Objects under test
        samples = []
        sampler = sp.SampleParser()
        virustotal = sp.VirusTotalAnalysis(api_key=API_KEY)

        # Collect the non-empty sample results
        for attachment in parser.attachments_list:
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])

            if sampler.result:
                samples.append(sampler.result)

        # The key must be set on the sample itself and on each inner file
        for sample in samples:
            virustotal.add_analysis(sample)
            self.assertIn('virustotal', sample)

            for inner_file in sample["files"]:
                self.assertIn('virustotal', inner_file)
예제 #4
0
    def setUp(self):
        """Parse both mail fixtures and keep their attachment lists."""
        parser = MailParser()

        # First fixture
        parser.parse_from_file(mail)
        self.attachments = parser.attachments_list

        # Second fixture, parsed with the same parser instance
        parser.parse_from_file(mail_thug)
        self.attachments_thug = parser.attachments_list
예제 #5
0
def main():
    """Parse the mail selected on the command line and print the requested parts.

    The mail comes from ``args.file`` or, failing that, ``args.string``.
    Each truthy option then prints the matching piece through
    ``safe_print``.
    """
    options = get_args()
    mail_parser = MailParser()

    if options.file:
        mail_parser.parse_from_file(options.file)
    elif options.string:
        mail_parser.parse_from_string(options.string)

    if options.json:
        # Round-trip through json to pretty-print the parsed mail.
        safe_print(json.dumps(
            json.loads(mail_parser.parsed_mail_json),
            ensure_ascii=False,
            indent=4))

    # (option name, parser attribute) pairs that are printed as they are,
    # in this order.
    plain_fields = (
        ("body", "body"),
        ("headers", "headers"),
        ("to", "to_"),
        ("from_", "from_"),
        ("subject", "subject"),
    )
    for option_name, attribute in plain_fields:
        if getattr(options, option_name):
            safe_print(getattr(mail_parser, attribute))

    if options.defects:
        for category in mail_parser.defects_category:
            safe_print(category)

    if options.anomalies:
        for anomaly in mail_parser.anomalies:
            safe_print(anomaly)

    if options.senderip:
        address = mail_parser.get_server_ipaddress(options.senderip)
        safe_print(address if address else "Not Found")

    if options.attachments:
        for attachment in mail_parser.attachments_list:
            safe_print(json.dumps(attachment, ensure_ascii=False, indent=4))
예제 #6
0
def main():
    """Parse the mail selected on the command line and print the requested parts.

    The mail comes from ``args.file`` or, failing that, ``args.string``.
    Every printed value except the "Not Found" message is first encoded
    as UTF-8.
    """
    options = get_args()
    mail_parser = MailParser()

    if options.file:
        mail_parser.parse_from_file(options.file)
    elif options.string:
        mail_parser.parse_from_string(options.string)

    if options.json:
        # Round-trip through json to pretty-print the parsed mail.
        pretty = json.dumps(
            json.loads(mail_parser.parsed_mail_json),
            ensure_ascii=False,
            indent=4)
        print(pretty.encode('utf-8'))

    # (option name, parser attribute) pairs that are printed as they are,
    # in this order.
    plain_fields = (
        ("body", "body"),
        ("headers", "headers"),
        ("to", "to_"),
        ("from_", "from_"),
        ("subject", "subject"),
    )
    for option_name, attribute in plain_fields:
        if getattr(options, option_name):
            print(getattr(mail_parser, attribute).encode('utf-8'))

    if options.defects:
        for category in mail_parser.defects_category:
            print(category.encode('utf-8'))

    if options.anomalies:
        for anomaly in mail_parser.anomalies:
            print(anomaly.encode('utf-8'))

    if options.senderip:
        address = mail_parser.get_server_ipaddress(options.senderip)
        if not address:
            print("Not Found")
        else:
            print(address.encode('utf-8'))

    if options.attachments:
        for attachment in mail_parser.attachments_list:
            dumped = json.dumps(attachment, ensure_ascii=False, indent=4)
            print(dumped.encode('utf-8'))
예제 #7
0
    def setUp(self):
        """Prepare the parsed mail, its hashed attachments and the parameters."""
        self.f = utils.reformat_output

        parser = MailParser()
        parser.parse_from_file(mail)

        # Parsed mail with an added analysis timestamp
        self.mail_obj = parser.parsed_mail_obj
        self.mail_obj['analisys_date'] = datetime.datetime.utcnow().isoformat()

        # Attachments built from the parser's list, then processed
        self.attachments = MailAttachments.withhashes(parser.attachments_list)
        self.attachments.run()

        self.parameters = dict(
            elastic_index_mail="spamscope_mails-",
            elastic_type_mail="spamscope",
            elastic_index_attach="spamscope_attachments-",
            elastic_type_attach="spamscope")
    def setUp(self):
        """Create the VirusTotal processor and the sample results of the fixture."""
        parser = MailParser()
        sampler = sp.SampleParser()
        self.virustotal = sp.VirusTotalProcessing(api_key=API_KEY)

        def analyse(attachment):
            # Run the sample parser on one attachment and hand back its result.
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])
            return sampler.result

        parser.parse_from_file(mail)
        self.attachments = [analyse(item) for item in parser.attachments_list]
예제 #9
0
    def setUp(self):
        """Create the Thug processor and the sample results of the fixture."""
        parser = MailParser()
        sampler = sp.SampleParser()
        self.thug = sp.ThugProcessing(
            referer="http://www.google.com/",
            extensions=[".js"],
            user_agents=["win7ie90", "winxpie80"])

        def analyse(attachment):
            # Run the sample parser on one attachment and hand back its result.
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])
            return sampler.result

        parser.parse_from_file(mail)
        self.attachments = [analyse(item) for item in parser.attachments_list]
    def setUp(self):
        """Create the Tika processor and the sample results of the fixture."""
        parser = MailParser()
        sampler = sp.SampleParser()
        self.tika = sp.TikaProcessing(
            jar="/opt/tika/tika-app-1.14.jar",
            valid_content_types=['application/zip'],
            memory_allocation=None)

        def analyse(attachment):
            # Run the sample parser on one attachment and hand back its result.
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])
            return sampler.result

        parser.parse_from_file(mail)
        self.attachments = [analyse(item) for item in parser.attachments_list]
예제 #11
0
    def test_virustotal(self):
        """Check that a VirusTotal-enabled parser tags each sample and its files."""
        # Parse the mail fixture
        parser = MailParser()
        parser.parse_from_file(mail)

        # Sample parser with VirusTotal switched on
        sampler = sp.SampleParser(
            virustotal_enabled=True,
            virustotal_api_key=API_KEY)

        # Each result is checked right after its attachment is parsed
        for attachment in parser.attachments_list:
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])

            self.assertIn('virustotal', sampler.result)

            for inner_file in sampler.result["files"]:
                self.assertIn('virustotal', inner_file)
예제 #12
0
    def test_complete(self):
        """Run the sample parser with Thug, Tika and VirusTotal all enabled.

        Expected keys: 'tika' and 'virustotal' but not 'thug' on each
        sample; 'virustotal' and 'thug' but not 'tika' on each inner file.
        """

        # Objects under test
        parser = MailParser()
        sampler = sp.SampleParser(
            blacklist_content_types=[],
            thug_enabled=True,
            tika_enabled=True,
            virustotal_enabled=True,
            tika_jar="/opt/tika/tika-app-1.14.jar",
            tika_memory_allocation=None,
            tika_valid_content_types=['application/zip'],
            virustotal_api_key=API_KEY,
            thug_referer="http://www.google.com/",
            thug_extensions=[".js"],
            thug_user_agents=["winxpie80"])

        # Parse the mail fixture and collect one result per attachment
        parser.parse_from_file(mail)
        samples = []

        for attachment in parser.attachments_list:
            sampler.parse_sample_from_base64(
                data=attachment['payload'],
                filename=attachment['filename'],
                mail_content_type=attachment['mail_content_type'],
                transfer_encoding=attachment['content_transfer_encoding'])
            samples.append(sampler.result)

        for sample in samples:
            self.assertIn("tika", sample)
            self.assertIn("virustotal", sample)
            self.assertNotIn("thug", sample)

            for inner_file in sample['files']:
                self.assertNotIn("tika", inner_file)
                self.assertIn("virustotal", inner_file)
                self.assertIn("thug", inner_file)
예제 #13
0
def textExtraction(f):
    """Return the body of the mail parsed from *f*.

    Args:
        f: Value handed to ``MailParser.parse_from_file`` (presumably the
            path of a mail file -- confirm against callers).

    Returns:
        The ``body`` attribute of the parser after parsing.
    """
    parser = MailParser()
    # The return value of parse_from_file was never used, so it is no
    # longer bound to a local name.
    parser.parse_from_file(f)
    return parser.body
예제 #14
0
wb = load_workbook(filename='ham.xlsx', read_only=True)
ws = wb['Sheet1']

# Materialise the columns so they can be indexed even when ``ws.columns``
# is a generator rather than a sequence.
# NOTE(review): some openpyxl releases do not expose ``columns`` on
# read-only worksheets -- confirm against the installed version.
allcolumns = tuple(ws.columns)

hcol1v = allcolumns[0]
hcol2p = allcolumns[1]

# Map every first-column value to the second-column value of the same row.
ham = {
    key_cell.value: value_cell.value
    for key_cell, value_cell in zip(hcol1v, hcol2p)
}

parser = MailParser()
parser.parse_from_file("TRAINING/TRAIN_00549.eml")

body = parser.body

tokens = Tokenizer.freqdata(body)

PR_wi_S = 1

# for spam: multiply in (spam value * token value) for every token that
# has a strictly positive entry in ``spam``
for keys in tokens:
    if keys in spam:
        spam_weight = float(spam.get(keys))
        if spam_weight > 0:
            PR_wi_S *= spam_weight * tokens.get(keys)

PR_wi_H = 1
예제 #15
0
def parse_dir(dir):
    """Parse every mail found under *dir* and write its pre-processed words.

    For each file returned by ``getAllFiles(dir)`` the mail body is run
    through ``removePunc``, ``stemWords`` and ``removeStopWords``; the
    remaining words are written, space separated, to a numbered file whose
    path is ``"./output/" + dir[1:] + <counter>``.

    Files that cannot be parsed are counted as failed and the matching
    ``./PreProcessed`` file is removed. Files whose body cannot be stemmed
    are also counted as failed and skipped, so that no output is written
    from missing or stale data.

    Args:
        dir: Source top directory. Its first character is dropped when the
            output path is built (presumably a leading "." -- confirm
            against callers).
    """
    from time import sleep

    source_top_directory = dir
    files = getAllFiles(source_top_directory)

    dest_top_directory = "./output/"
    dest_directory = dest_top_directory + source_top_directory[1:]

    # Any failure to create a directory is reported as "Exists".
    try:
        makedirs(dest_top_directory)
        print("Output top directory check: Created")
    except OSError:
        print("Output top directory check: Exists")
    try:
        makedirs(dest_directory)
        print("Output inner directory check: Created")
    except OSError:
        print("Output inner directory check: Exists")

    delay_time = 5
    print("Parsing will start in", delay_time, "seconds")
    sleep(delay_time)

    parser = MailParser()
    success = 0
    fail = 0
    i = 0
    for file in files:
        print("file to be parsed:")
        print(file)
        try:
            parser.parse_from_file(file)
        except Exception:
            # Not a bare except, so Ctrl-C still interrupts a long run.
            print("file cannot be parsed")
            output_file_path = "./PreProcessed" + file[1:]
            try:
                remove(output_file_path)
            except OSError:
                print("already removed")
            fail += 1
            continue

        mail_body = removePunc(str(parser.body))

        try:
            mail_body_words = stemWords(mail_body)
        except Exception:
            # Without stemmed words there is nothing valid to write for
            # this file; skip it rather than reuse the previous file's.
            print("file cannot be stemmed")
            fail += 1
            continue

        mail_body_words = removeStopWords(mail_body_words)

        # Count the file, and advance the output numbering, only once
        # pre-processing has fully succeeded.
        success += 1
        i += 1

        output_file_path = dest_directory + str(i)
        print("Parsing Complete")
        print(file, "parsed to output file: ", output_file_path, "\n\n")
        with open(output_file_path, 'w') as of:
            for word in mail_body_words:
                try:
                    of.write(str(word) + " ")
                except UnicodeError:
                    # Best effort: drop words that cannot be encoded.
                    continue

    print("Parsed: ", success)
    print("Failed: ", fail)
예제 #16
0
class TestMailParser(unittest.TestCase):
    """Tests of ``MailParser`` against the mail fixtures."""

    def setUp(self):
        """Create a fresh parser for every test."""
        self.parser = MailParser()

    def test_ipaddress(self):
        """Sender IP lookup for a known trust string and for blank ones."""
        self.parser.parse_from_file(mail_test_2)
        trust = "smtp.customers.net"

        ip = "217.76.210.112"
        result = self.parser.get_server_ipaddress(trust)
        self.assertEqual(result, ip)

        trust = ""
        result = self.parser.get_server_ipaddress(trust)
        self.assertEqual(result, None)

        trust = "   "
        result = self.parser.get_server_ipaddress(trust)
        self.assertEqual(result, None)

    def test_fingerprints_body(self):
        """Hashes of the UTF-8 encoded body match the known digests."""
        self.parser.parse_from_file(mail_test_1)
        md5, sha1, sha256, sha512 = fingerprints(
            self.parser.body.encode("utf-8"))
        self.assertEqual(md5, "1bbdb7dcf511113bbc0c1b214aeac392")
        self.assertEqual(sha1, "ce9e62b50fa4e2168278880b14460b905b24eb4b")
        self.assertEqual(sha256, ("1e9b96e3f1bc74702f9703391e8ba0715b849"
                                  "7127a7ff857013ab33385898574"))
        self.assertEqual(sha512, ("ad858f7b5ec5549e55650fd13df7683e403489"
                                  "77522995851fb6b625ac54744cf3a4bf652784"
                                  "dba971ef99afeec4e6caf2fdd10be72eabb730"
                                  "c312ffbe1c4de3"))

    def test_malformed_mail(self):
        """Both expected defect categories are reported for the fixture."""
        self.parser.parse_from_file(mail_malformed_3)
        defects_category = self.parser.defects_category
        self.assertIn("StartBoundaryNotFoundDefect", defects_category)
        self.assertIn("MultipartInvariantViolationDefect", defects_category)

    def test_type_error(self):
        """The fixture yields five attachments with text-type filenames."""
        self.parser.parse_from_file(mail_test_5)
        self.assertEqual(len(self.parser.attachments_list), 5)
        for i in self.parser.attachments_list:
            self.assertIsInstance(i["filename"], six.text_type)

    def test_valid_mail(self):
        """Parsing a string that is not a mail raises ``InvalidMail``."""
        with self.assertRaises(InvalidMail):
            self.parser.parse_from_string("fake mail")

    def test_valid_date_mail(self):
        """Parsing the fixture reports the 'mail_without_date' anomaly."""
        # The original statement ended with a stray comma, which turned it
        # into a one-element tuple expression.
        self.parser.parse_from_file(mail_test_1)
        self.assertIn("mail_without_date", self.parser.anomalies)

    def test_parsing_know_values(self):
        """Parsed fields of the fixture equal their known values."""
        self.parser.parse_from_file(mail_test_2)
        trust = "smtp.customers.net"

        self.assertEqual(False, self.parser.has_defects)

        raw = "217.76.210.112"
        result = self.parser.get_server_ipaddress(trust)
        self.assertEqual(raw, result)

        raw = "<*****@*****.**>"
        result = self.parser.message_id
        self.assertEqual(raw, result)

        raw = "mporcile@server_mail.it"
        result = self.parser.to_
        self.assertEqual(raw, result)

        raw = "<*****@*****.**>"
        result = self.parser.from_
        self.assertEqual(raw, result)

        raw = "Bollettino Meteorologico del 29/11/2015"
        result = self.parser.subject
        self.assertEqual(raw, result)

        result = self.parser.has_defects
        self.assertEqual(False, result)

        result = len(self.parser.attachments_list)
        self.assertEqual(3, result)

        # The raw date "Sun, 29 Nov 2015 09:45:18 +0100" is 08:45:18 in UTC.
        raw_utc = datetime.datetime(2015, 11, 29, 8, 45, 18, 0).isoformat()
        result = self.parser.date_mail.isoformat()
        self.assertEqual(raw_utc, result)

    def test_types(self):
        """Every public attribute of the parser has the expected type."""
        self.parser.parse_from_file(mail_test_2)
        trust = "smtp.customers.net"

        self.assertEqual(False, self.parser.has_defects)

        result = self.parser.parsed_mail_obj
        self.assertIsInstance(result, dict)
        self.assertNotIn("defects", result)
        self.assertNotIn("anomalies", result)
        self.assertIn("has_defects", result)
        self.assertIn("has_anomalies", result)

        result = self.parser.get_server_ipaddress(trust)
        self.assertIsInstance(result, six.text_type)

        result = self.parser.parsed_mail_json
        self.assertIsInstance(result, six.text_type)

        result = self.parser.headers
        self.assertIsInstance(result, six.text_type)

        result = self.parser.body
        self.assertIsInstance(result, six.text_type)

        result = self.parser.date_mail
        self.assertIsInstance(result, datetime.datetime)

        result = self.parser.from_
        self.assertIsInstance(result, six.text_type)

        result = self.parser.to_
        self.assertIsInstance(result, six.text_type)

        result = self.parser.subject
        self.assertIsInstance(result, six.text_type)

        result = self.parser.message_id
        self.assertIsInstance(result, six.text_type)

        result = self.parser.attachments_list
        self.assertIsInstance(result, list)

        result = self.parser.date_mail
        self.assertIsInstance(result, datetime.datetime)

        result = self.parser.defects
        self.assertIsInstance(result, list)

        result = self.parser.anomalies
        self.assertIsInstance(result, list)

    def test_defects_anomalies(self):
        """Defects and anomalies reported for two fixtures in a row."""
        self.parser.parse_from_file(mail_malformed_1)

        self.assertEqual(True, self.parser.has_defects)
        self.assertEqual(1, len(self.parser.defects))
        self.assertEqual(1, len(self.parser.defects_category))
        self.assertIn("defects", self.parser.parsed_mail_obj)
        self.assertIn("StartBoundaryNotFoundDefect",
                      self.parser.defects_category)
        self.assertIsInstance(self.parser.parsed_mail_json, six.text_type)

        result = len(self.parser.attachments_list)
        self.assertEqual(1, result)

        # The same parser instance is reused for the second fixture; the
        # expected defects differ between Python 2 and Python 3.
        self.parser.parse_from_file(mail_test_1)
        if six.PY2:
            self.assertEqual(False, self.parser.has_defects)
            self.assertNotIn("defects", self.parser.parsed_mail_obj)
        elif six.PY3:
            self.assertEqual(True, self.parser.has_defects)
            self.assertEqual(1, len(self.parser.defects))
            self.assertEqual(1, len(self.parser.defects_category))
            self.assertIn("defects", self.parser.parsed_mail_obj)
            self.assertIn(
                "CloseBoundaryNotFoundDefect", self.parser.defects_category)

        self.assertEqual(True, self.parser.has_anomalies)
        self.assertEqual(2, len(self.parser.anomalies))
        self.assertIn("anomalies", self.parser.parsed_mail_obj)
        self.assertIn("has_anomalies", self.parser.parsed_mail_obj)

    def test_defects_bug(self):
        """The fixture reports one defect and has no attachments."""
        self.parser.parse_from_file(mail_malformed_2)

        self.assertEqual(True, self.parser.has_defects)
        self.assertEqual(1, len(self.parser.defects))
        self.assertEqual(1, len(self.parser.defects_category))
        self.assertIn("defects", self.parser.parsed_mail_obj)
        self.assertIn("StartBoundaryNotFoundDefect",
                      self.parser.defects_category)
        self.assertIsInstance(self.parser.parsed_mail_json, six.text_type)

        result = len(self.parser.attachments_list)
        self.assertEqual(0, result)

    def test_add_content_type(self):
        """The single attachment exposes its content type and encoding."""
        self.parser.parse_from_file(mail_test_3)

        self.assertEqual(False, self.parser.has_defects)

        result = self.parser.parsed_mail_obj

        self.assertEqual(len(result["attachments"]), 1)
        self.assertIsInstance(
            result["attachments"][0]["mail_content_type"], six.text_type)
        self.assertIsInstance(
            result["attachments"][0]["payload"], six.text_type)
        self.assertEqual(
            result["attachments"][0]["content_transfer_encoding"],
            "quoted-printable")
예제 #17
0
no_of_spam = 0
no_of_ham = 0

# Walk the labels in order; label number i belongs to the training mail
# whose file name carries i zero-padded to at least three digits.
for i, lab in enumerate(label):
    a = str(i).zfill(3)
    f = "TRAINING/TRAIN_00" + a + ".eml"
    print(f)
    parser.parse_from_file(f)
    print(lab)
    # Label '0' adds the body to the spam text, label '1' to the ham text;
    # any other label is ignored.
    if lab == '0':
        spam += parser.body
        no_of_spam += 1
    elif lab == '1':
        ham += parser.body
        no_of_ham += 1


ferqofspam = Tokenizer.freqdata(spam)
print(ferqofspam)

print(no_of_spam)