def do_eml(eml, out_txt):
    """Tokenize an email and append its token line to *out_txt* if dedup() accepts it.

    Every text yielded by ``eml2str(eml)`` is whitespace-normalised and
    tokenized; the text producing the longest ``vtok`` list wins.

    Args:
        eml: the email, in whatever form ``eml2str`` accepts.
        out_txt: writable text stream; receives one space-joined token line.

    Returns:
        0 when fewer than 10 ``vtokens`` were found, otherwise the value
        returned by ``dedup()`` (a line is written only when it is truthy).
    """
    vtokens = []
    tokens = []
    for text in eml2str(eml):
        # NOTE(review): this marker looks like the boilerplate of a spam
        # filter's report message (leading "S"/"s" left off) - confirm.
        if 'pam detection software, running on the system' in text:
            continue
        # Drop double quotes and collapse all whitespace runs to single spaces.
        t = " ".join(text.replace('"', ' ').split())
        if len(t) > 10:
            try:
                vtok, tok = tokenize(t, wordmap)
                # Keep the candidate with the most vtokens seen so far.
                if len(vtok) > len(vtokens):
                    vtokens = vtok
                    tokens = tok
            except Exception:
                # Best effort: one bad part must not abort the whole email,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                eprint(traceback.format_exc())

    if len(vtokens) < 10:
        return 0

    # Earlier tunings tried here: dedup(vtokens, 5, (len(vtokens)-10)/3)
    # and dedup1(vtokens).
    # NOTE(review): under Python 3 the third argument is a float (true
    # division) - confirm dedup() is fine with that.
    ok = dedup(vtokens, 7, (len(vtokens) - 10) * 4 / 5)
    if ok:
        out_txt.write(" ".join(tokens) + "\n")
    return ok
def do_eml(msg):
    """Classify a parsed email with deepspam_test() and return a verdict string.

    Every text yielded by ``eml2str(msg)`` is whitespace-normalised and
    tokenized; the text producing the longest ``vtok`` list is scored.
    The score and tokens are also appended to ``deepspam.res`` (best effort).

    NOTE: callers should build *msg* from the raw message as bytes; going
    through a str can mangle UTF-8 characters (e.g. sql_0000022480.eml, where
    HTML entities and UTF-8 are mixed inside text/plain).

    Args:
        msg: the parsed message, in whatever form ``eml2str`` accepts.

    Returns:
        ``"toosmall"`` when fewer than 10 vtokens were found, otherwise one of
        ``"ham"``, ``"maybeham"``, ``"20ham"``, ``"spam"``, ``"maybespam"``,
        ``"80spam"`` or ``"dunno"``, followed by `` <score>%``.
    """
    vtokens = []
    tokens = []
    for text in eml2str(msg):
        # NOTE(review): this marker looks like the boilerplate of a spam
        # filter's report message (leading "S"/"s" left off) - confirm.
        if 'pam detection software, running on the system' in text:
            continue
        # Drop double quotes and collapse all whitespace runs to single spaces.
        t = " ".join(text.replace('"', ' ').split())
        if len(t) > 10:
            try:
                vtok, tok = tokenize(t, wordmap)
                # Keep the candidate with the most vtokens seen so far.
                if len(vtok) > len(vtokens):
                    vtokens = vtok
                    tokens = tok
            except Exception:
                # Best effort: one bad part must not abort the whole email.
                eprint(traceback.format_exc())

    print("NUM of tokens: %d / %d" % (len(vtokens), len(tokens)))
    if len(vtokens) < 10:
        return "toosmall"
    print(" ".join(vtokens))

    res = deepspam_test(vtokens)
    # NOTE(review): the reason for the +0.1 offset is not evident from this
    # function; kept unchanged.
    res += 0.1
    print(res)

    # Result log is best effort: a failure here must not affect the verdict,
    # but it is reported instead of being silently ignored.
    try:
        with open("deepspam.res", "at") as f:
            f.write("%3d%%:" % (res) + " ".join(tokens) + "\n")
    except Exception as exc:
        eprint("DEEPSPAM: could not append to deepspam.res: %s" % (exc,))

    # Low scores first, then high scores; everything in between is "dunno".
    if res < 2:
        return "ham %d%%" % (res)
    if res < 10:
        return "maybeham %d%%" % (res)
    if res < 20:
        return "20ham %d%%" % (res)
    if res > 98:
        return "spam %d%%" % (res)
    if res > 90:
        return "maybespam %d%%" % (res)
    if res > 80:
        return "80spam %d%%" % (res)
    return "dunno %d%%" % (res)
def header(self, name, val):
    """Milter header callback: note greylist rejects and spool the header.

    Sets ``self.reject`` when an ``X-Grey-ng`` header starts with "REJECT",
    and appends ``"name: val\\n"`` as bytes to ``self.fp`` when a spool file
    is open. Encoding or write failures are logged, never raised.

    Returns:
        Always ``Milter.CONTINUE``.
    """
    if name == "X-Grey-ng" and val[0:6] == "REJECT":
        self.reject = 1

    if self.fp:
        try:
            line = "%s: %s\n" % (name, val)
            try:
                # Python 2: plain UTF-8. Surrogates are supported in theory
                # under py2 as well, but only by the unreleased pymilter.
                data = line.encode()
            except Exception:
                # Python 3: special (surrogate-escaped) text that carries
                # raw 8-bit bytes; map the escapes back to those bytes.
                data = line.encode(encoding='ascii', errors='surrogateescape')
            # Write exactly once: the fallback above is for encoding
            # failures only, so a failed write is never retried.
            self.fp.write(data)
        except Exception:
            eprint(
                "DEEPSPAM: Exception at header(%s) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
                % (name))
            eprint(traceback.format_exc())
    return Milter.CONTINUE
def eom(self):
    """Milter end-of-message callback: score the spooled mail and tag it.

    Parses the message spooled in ``self.fp``, passes it to ``do_eml()`` and
    adds the result as an ``X-deepspam`` header. Any failure is logged and the
    message is accepted unmodified.

    Returns:
        Always ``Milter.ACCEPT``.
    """
    if not self.fp:
        return Milter.ACCEPT
    try:
        self.fp.seek(0)
        print("PARSING %d body chars" % self.bodysize)
        try:
            msg = email.message_from_binary_file(self.fp)  # python 3.2+
        except Exception:
            # python2 has no message_from_binary_file. Rewind first so the
            # fallback parser never starts from a partially consumed stream.
            self.fp.seek(0)
            msg = email.message_from_file(self.fp)
        res = do_eml(msg)
        print("X-deepspam: " + res)
        self.addheader("X-deepspam", res)
    except Exception:
        eprint(
            "DEEPSPAM: Exception at eom() !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        )
        eprint(traceback.format_exc())
    # Rejecting based on self.reject is currently disabled:
    # if self.reject:
    #     self.setreply('541','5.7.1','Sorry, your address is already blocked for sending spam/virus, contact postmaster')
    #     return Milter.REJECT
    return Milter.ACCEPT  # ACCEPT modified message
def do_eml(eml):
    """Classify an email with deepspam_test() and return a short verdict label.

    Every text yielded by ``eml2str(eml)`` is whitespace-normalised and
    tokenized; the text producing the longest ``vtok`` list is scored.
    The score and tokens are also appended to ``test_res`` (best effort).

    Args:
        eml: the email, in whatever form ``eml2str`` accepts.

    Returns:
        ``"toosmall"`` when fewer than 10 vtokens were found, otherwise
        ``"ham-<score>"``, ``"ham-maybe"``, ``"spam-<score>"``,
        ``"spam-maybe"`` or ``"dunno"``.
    """
    vtokens = []
    tokens = []
    for text in eml2str(eml):
        # NOTE(review): this marker looks like the boilerplate of a spam
        # filter's report message (leading "S"/"s" left off) - confirm.
        if 'pam detection software, running on the system' in text:
            continue
        # Drop double quotes and collapse all whitespace runs to single spaces.
        t = " ".join(text.replace('"', ' ').split())
        if len(t) > 10:
            try:
                vtok, tok = tokenize(t, wordmap)
                # Keep the candidate with the most vtokens seen so far.
                if len(vtok) > len(vtokens):
                    vtokens = vtok
                    tokens = tok
            except Exception:
                # Best effort: one bad part must not abort the whole email.
                eprint(traceback.format_exc())

    print("NUM of tokens: %d / %d"%(len(vtokens),len(tokens)))
    if len(vtokens) < 10:
        return "toosmall"
    print(" ".join(vtokens))

    res = deepspam_test(vtokens)
    # NOTE(review): the reason for the +0.1 offset is not evident from this
    # function; kept unchanged.
    res += 0.1
    print(res)

    # Result log is best effort: a failure here must not affect the verdict,
    # but it is reported instead of being silently ignored.
    try:
        with open("test_res", "at") as f:
            f.write("%3d%%:"%(res) + " ".join(tokens) + "\n")
    except Exception as exc:
        eprint("DEEPSPAM: could not append to test_res: %s" % (exc,))

    # Low scores first, then high scores; everything in between is "dunno".
    if res <= 10:
        return "ham-%d"%(res)
    if res < 20:
        return "ham-maybe"
    if res >= 90:
        return "spam-%d"%(res)
    if res > 80:
        return "spam-maybe"
    return "dunno"