def _digestAuthentication(self, login, method):
    """Perform Digest HTTP authentication per RFC 2617.

    login is the content of the Authorization header (after "Digest");
    method is the HTTP method of the request.  Returns True when the
    user has logged in successfully, False otherwise.
    """
    def stripQuotes(s):
        # Remove one pair of surrounding double quotes, if present.
        # The old ``cond and a or b`` idiom returned '""' unchanged
        # (an empty quoted value is falsy) and raised IndexError on "".
        if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
            return s[1:-1]
        return s
    options = dict(self._login_splitter.findall(login))
    userName = stripQuotes(options["username"])
    password = self._server.getPasswordForUser(userName)
    nonce = stripQuotes(options["nonce"])
    # The following computations are based upon RFC 2617.
    A1 = "%s:%s:%s" % (userName, self._server.getRealm(), password)
    HA1 = md5(A1).hexdigest()
    A2 = "%s:%s" % (method, stripQuotes(options["uri"]))
    HA2 = md5(A2).hexdigest()
    unhashedDigest = ""
    if "qop" in options:
        # Some browsers send empty nc/qop values; fall back to the
        # RFC defaults rather than computing a garbage digest.
        if not options["nc"]:
            options["nc"] = "00000001"
        if not options["qop"]:
            options["qop"] = "auth"
        unhashedDigest = "%s:%s:%s:%s:%s:%s" % \
            (HA1, nonce, stripQuotes(options["nc"]),
             stripQuotes(options["cnonce"]),
             stripQuotes(options["qop"]), HA2)
    else:
        unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2)
    hashedDigest = md5(unhashedDigest).hexdigest()
    # Both the digest and the nonce must check out.
    return (stripQuotes(options["response"]) == hashedDigest and
            self._isValidNonce(nonce))
def structure(self, ext=False):
    """Return MIME-IMB body structure data for this message.

    For every MIME part this yields a sequence of: mime type, mime
    subtype, charset parameter (or None), content id, description,
    transfer encoding, and size.  Text parts additionally carry their
    line count.  When ext is true, the extension fields (MD5 digest of
    the body, content disposition, content language) are appended.
    A single-part message is returned unwrapped rather than as a
    one-element list.
    """
    structures = []
    for part in self.walk():
        cs = part.get_content_charset()
        charset = None if cs is None else ("charset", cs)
        text = part.as_string()
        entry = [
            part.get_main_type(),
            part.get_subtype(),
            charset,
            part.get('Content-Id'),
            part.get('Content-Description'),
            part.get('Content-Transfer-Encoding'),
            str(len(text)),
        ]
        if part.get_main_type() == "text":
            entry.append(str(text.count("\n")))
        if ext:
            entry.extend([
                md5(text).digest(),
                part.get('Content-Disposition'),
                part.get('Content-Language'),
            ])
        structures.append(entry)
    return structures[0] if len(structures) == 1 else structures
def generate_checksum(msg):
    """Return a fuzzy checksum of msg: four md5 digests joined by '.'.

    Modelled after Justin Mason's fuzzy checksummer for SpamAssassin.
    The message body is cleaned, then broken into lines.  The list of
    lines is broken into four parts and separate checksums are
    generated for each part, then joined with '.'.  Downstream
    processes can split those chunks and consider them separately or
    in various combinations if desired.
    """
    data = flatten(msg)
    # Get rid of anything which looks like an HTML tag and downcase it all.
    data = re.sub(r"<[^>]*>", "", data).lower()
    # Delete anything which looks like a url or email address.
    # Not sure what a pmguid: url is but it seems to occur frequently in spam.
    words = [w for w in data.split(' ')
             if ('@' not in w and
                 (':' not in w or
                  w[:4] != "ftp:" and w[:7] != "mailto:" and
                  w[:5] != "http:" and w[:7] != "gopher:" and
                  w[:8] != "pmguid:"))]
    # Keep only lines which contain white space (the original comment
    # claimed the opposite of what the condition does).
    lines = [line for line in " ".join(words).split('\n') if ' ' in line]
    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4 + 1
    # Renamed from ``sum`` which shadowed the builtin.
    checksums = []
    for i in range(4):
        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
        checksums.append(binascii.b2a_hex(md5(chunk).digest()))
    return ".".join(checksums)
def generate_checksum(msg):
    """Return a fuzzy checksum of msg: four md5 hexdigests joined by '.'.

    The flattened message body is cleaned, split into lines, the lines
    divided into four chunks, and each chunk hashed separately so that
    downstream consumers can compare the pieces individually.
    """
    fp = io.StringIO()
    g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)
    g.flatten(msg)
    text = fp.getvalue()
    # The body is everything after the first blank line; guard against
    # a header-only message, which previously raised IndexError.
    pieces = text.split("\n\n", 1)
    body = pieces[1] if len(pieces) > 1 else ""
    lines = clean(body).split("\n")
    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4 + 1
    digest = []
    for i in range(4):
        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
        # hashlib.md5 requires bytes; passing str raises TypeError on
        # Python 3, so encode the chunk first.
        digest.append(md5(chunk.encode("utf-8")).hexdigest())
    return ".".join(digest)
def generate_checksum(msg):
    """Return a fuzzy checksum of msg: four md5 digests joined by '.'."""
    # Strip HTML-ish tags and lowercase everything.
    cleaned = re.sub(r"<[^>]*>", "", flatten(msg)).lower()

    def keep(word):
        # Drop anything that looks like an email address or a url.
        if '@' in word:
            return False
        if ':' not in word:
            return True
        return (word[:4] != "ftp:" and word[:7] != "mailto:" and
                word[:5] != "http:" and word[:7] != "gopher:" and
                word[:8] != "pmguid:")

    kept = [w for w in cleaned.split(' ') if keep(w)]
    # Only lines containing a space survive.
    lines = [ln for ln in " ".join(kept).split('\n') if ' ' in ln]
    # +1 so trailing lines are never dropped.
    chunksize = len(lines) // 4 + 1
    digests = []
    for idx in range(4):
        block = "\n".join(lines[idx * chunksize:(idx + 1) * chunksize])
        digests.append(binascii.b2a_hex(md5(block).digest()))
    return ".".join(digests)
def extract_ocr_info(self, pnmfiles):
    """OCR each pnm file, caching extracted text and tokens by content hash."""
    assert self.engine, "must have an engine!"
    textbits = []
    tokens = set()
    for pnmfile in pnmfiles:
        preserve = False
        # Hash the raw image bytes.  Read in binary mode and close the
        # handle promptly: the old code leaked the open file object and
        # a text-mode read passes str to md5 (TypeError on Python 3).
        with open(pnmfile, "rb") as imgfile:
            fhash = md5(imgfile.read()).hexdigest()
        if fhash in self.cache:
            self.hits += 1
            ctext, ctokens = self.cache[fhash]
        else:
            self.misses += 1
            if self.engine.program:
                try:
                    ctext = self.engine.extract_text(pnmfile).lower()
                # Python 3 syntax, consistent with the complete variant
                # of this method elsewhere in this file.
                except SystemError as msg:
                    print(msg, file=sys.stderr)
                    preserve = True
                    ctext = ""
            else:
                print("No OCR program '%s' available - can't get text!"
                      % (self.engine.engine_name,), file=sys.stderr)
                ctext = ""
            ctokens = set()
            if not ctext.strip():
                ctokens.add("image-text:no text found")
            else:
                nlines = len(ctext.strip().split("\n"))
                if nlines:
                    ctokens.add("image-text-lines:%d" % int(log2(nlines)))
            self.cache[fhash] = (ctext, ctokens)
    # NOTE(review): textbits/tokens are accumulated but this variant ends
    # without using or returning them, and pnmfiles are never removed -
    # confirm against the complete variant of this method in this file.
def _digestAuthentication(self, login, method):
    """Perform Digest HTTP authentication per RFC 2617.

    Returns True when the user has logged in successfully, False
    otherwise.
    """
    def stripQuotes(s):
        # Remove one pair of surrounding double quotes, if present.
        # The old ``cond and a or b`` idiom returned '""' unchanged
        # (an empty quoted value is falsy) and raised IndexError on "".
        if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
            return s[1:-1]
        return s
    options = dict(self._login_splitter.findall(login))
    userName = stripQuotes(options["username"])
    password = self._server.getPasswordForUser(userName)
    nonce = stripQuotes(options["nonce"])
    # The following computations are based upon RFC 2617.
    A1 = "%s:%s:%s" % (userName, self._server.getRealm(), password)
    HA1 = md5(A1).hexdigest()
    A2 = "%s:%s" % (method, stripQuotes(options["uri"]))
    HA2 = md5(A2).hexdigest()
    unhashedDigest = ""
    # dict.has_key() was removed in Python 3; use the ``in`` operator.
    if "qop" in options:
        # IE 6.0 doesn't give nc back correctly?
        if not options["nc"]:
            options["nc"] = "00000001"
        # Firefox 1.0 doesn't give qop back correctly?
        if not options["qop"]:
            options["qop"] = "auth"
        unhashedDigest = "%s:%s:%s:%s:%s:%s" % (
            HA1,
            nonce,
            stripQuotes(options["nc"]),
            stripQuotes(options["cnonce"]),
            stripQuotes(options["qop"]),
            HA2,
        )
    else:
        unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2)
    hashedDigest = md5(unhashedDigest).hexdigest()
    return stripQuotes(options["response"]) == hashedDigest and self._isValidNonce(nonce)
def _digestAuthentication(self, login, method):
    """Perform Digest HTTP authentication per RFC 2617.

    Returns True when the user has logged in successfully, False
    otherwise.
    """
    def stripQuotes(s):
        # Remove one pair of surrounding double quotes, if present.
        # The old ``cond and a or b`` idiom returned '""' unchanged
        # (an empty quoted value is falsy) and raised IndexError on "".
        if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
            return s[1:-1]
        return s
    options = dict(self._login_splitter.findall(login))
    userName = stripQuotes(options["username"])
    password = self._server.getPasswordForUser(userName)
    nonce = stripQuotes(options["nonce"])
    # The following computations are based upon RFC 2617.
    A1 = "%s:%s:%s" % (userName, self._server.getRealm(), password)
    HA1 = md5(A1).hexdigest()
    A2 = "%s:%s" % (method, stripQuotes(options["uri"]))
    HA2 = md5(A2).hexdigest()
    unhashedDigest = ""
    # dict.has_key() was removed in Python 3; use the ``in`` operator.
    if "qop" in options:
        # IE 6.0 doesn't give nc back correctly?
        if not options["nc"]:
            options["nc"] = "00000001"
        # Firefox 1.0 doesn't give qop back correctly?
        if not options["qop"]:
            options["qop"] = "auth"
        unhashedDigest = "%s:%s:%s:%s:%s:%s" % \
            (HA1, nonce, stripQuotes(options["nc"]),
             stripQuotes(options["cnonce"]),
             stripQuotes(options["qop"]), HA2)
    else:
        unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2)
    hashedDigest = md5(unhashedDigest).hexdigest()
    return (stripQuotes(options["response"]) == hashedDigest and
            self._isValidNonce(nonce))
def extract_ocr_info(self, pnmfiles):
    """OCR each pnm file and return (joined_text, token_set).

    Results are cached by the md5 of the raw image bytes.  Each pnm
    file is deleted after successful processing; files whose OCR run
    raised are preserved for post-mortem inspection.
    """
    assert self.engine, "must have an engine!"
    textbits = []
    tokens = set()
    for pnmfile in pnmfiles:
        preserve = False
        # Hash the raw image bytes.  Read in binary mode and close the
        # handle promptly: the old code leaked the open file object and
        # a text-mode read passes str to md5 (TypeError on Python 3).
        with open(pnmfile, "rb") as imgfile:
            fhash = md5(imgfile.read()).hexdigest()
        if fhash in self.cache:
            self.hits += 1
            ctext, ctokens = self.cache[fhash]
        else:
            self.misses += 1
            if self.engine.program:
                try:
                    ctext = self.engine.extract_text(pnmfile).lower()
                except SystemError as msg:
                    print(msg, file=sys.stderr)
                    # Keep the pnm file around so the failure can be
                    # investigated.
                    preserve = True
                    ctext = ""
            else:
                # We should not get here if no OCR is enabled.  If it
                # is enabled and we have no program, its OK to spew lots
                # of warnings - they should either disable OCR (it is by
                # default), or fix their config.
                print("No OCR program '%s' available - can't get text!"
                      % (self.engine.engine_name,), file=sys.stderr)
                ctext = ""
            ctokens = set()
            if not ctext.strip():
                # Lots of spam now contains images in which it is
                # difficult or impossible (using ocrad) to find any
                # text.  Make a note of that.
                ctokens.add("image-text:no text found")
            else:
                nlines = len(ctext.strip().split("\n"))
                if nlines:
                    ctokens.add("image-text-lines:%d" % int(log2(nlines)))
            self.cache[fhash] = (ctext, ctokens)
        textbits.append(ctext)
        tokens |= ctokens
        if not preserve:
            os.unlink(pnmfile)
    return "\n".join(textbits), tokens
def generate_checksum(msg):
    """Return a fuzzy checksum of msg: four md5 hexdigests joined by '.'.

    Modelled after Justin Mason's fuzzy checksummer for SpamAssassin.
    Message body is cleaned, then broken into lines.  The list of lines
    is then broken into four parts and separate checksums are generated
    for each part.  They are then joined together with '.'.  Downstream
    processes can split those chunks into pieces and consider them
    separately or in various combinations if desired.
    """
    # The Python 2 StringIO module no longer exists; use io.StringIO,
    # as the other version of this function in this file already does.
    import io
    fp = io.StringIO()
    g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)
    g.flatten(msg)
    text = fp.getvalue()
    # The body is everything after the first blank line; guard against
    # a header-only message, which previously raised IndexError.
    pieces = text.split("\n\n", 1)
    body = pieces[1] if len(pieces) > 1 else ""
    lines = clean(body).split("\n")
    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines) // 4 + 1
    digest = []
    for i in range(4):
        chunk = "\n".join(lines[i * chunksize:(i + 1) * chunksize])
        # hashlib.md5 requires bytes; passing str raises TypeError on
        # Python 3, so encode the chunk first.
        digest.append(md5(chunk.encode("utf-8")).hexdigest())
    return ".".join(digest)
def generate_checksum(msg):
    """Return a fuzzy checksum of msg: four md5 hexdigests joined by '.'.

    Modelled after Justin Mason's fuzzy checksummer for SpamAssassin.
    Message body is cleaned, then broken into lines.  The list of lines
    is then broken into four parts and separate checksums are generated
    for each part.  They are then joined together with '.'.  Downstream
    processes can split those chunks into pieces and consider them
    separately or in various combinations if desired.
    """
    # The Python 2 StringIO module no longer exists; use io.StringIO,
    # as the other version of this function in this file already does.
    import io
    fp = io.StringIO()
    g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)
    g.flatten(msg)
    text = fp.getvalue()
    # The body is everything after the first blank line; guard against
    # a header-only message, which previously raised IndexError.
    pieces = text.split("\n\n", 1)
    body = pieces[1] if len(pieces) > 1 else ""
    lines = clean(body).split("\n")
    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4 + 1
    digest = []
    for i in range(4):
        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
        # hashlib.md5 requires bytes; passing str raises TypeError on
        # Python 3, so encode the chunk first.
        digest.append(md5(chunk.encode("utf-8")).hexdigest())
    return ".".join(digest)
def structure(self, ext=False):
    """Return MIME-IMB body structure data for this message.

    Body structure data describes the MIME-IMB format of a message and
    consists of a sequence of mime type, mime subtype, parameters,
    content id, description, encoding, and size.  If the mime type is
    text, the number of lines of text follows.  When ext is true, the
    extension fields (MD5 hash of the body, body disposition, body
    language) are appended.  A single-part message is returned
    unwrapped rather than as a one-element list.
    """
    # TODO: per RFC 3501, message/rfc822 parts should also carry the
    # contained message's envelope information, body structure data and
    # line count; only the text line count is implemented here.
    s = []
    for part in self.walk():
        charset = part.get_content_charset()
        if charset is not None:
            charset = ("charset", charset)
        # Hoist the repeated get_main_type()/as_string() calls.
        main_type = part.get_main_type()
        text = part.as_string()
        part_s = [
            main_type,
            part.get_subtype(),
            charset,
            part.get('Content-Id'),
            part.get('Content-Description'),
            part.get('Content-Transfer-Encoding'),
            str(len(text)),
        ]
        if main_type == "text":
            part_s.append(str(text.count("\n")))
        if ext:
            part_s.extend([
                md5(text).digest(),
                part.get('Content-Disposition'),
                part.get('Content-Language'),
            ])
        s.append(part_s)
    if len(s) == 1:
        return s[0]
    return s
def extract_ocr_info(self, pnmfiles):
    """OCR each pnm file, caching extracted text and tokens by content hash."""
    assert self.engine, "must have an engine!"
    textbits = []
    tokens = set()
    for pnmfile in pnmfiles:
        preserve = False
        # Hash the raw image bytes.  Read in binary mode and close the
        # handle promptly: the old code leaked the open file object and
        # a text-mode read passes str to md5 (TypeError on Python 3).
        with open(pnmfile, "rb") as imgfile:
            fhash = md5(imgfile.read()).hexdigest()
        if fhash in self.cache:
            self.hits += 1
            ctext, ctokens = self.cache[fhash]
        else:
            self.misses += 1
            if self.engine.program:
                try:
                    ctext = self.engine.extract_text(pnmfile).lower()
                # Python 3 syntax, consistent with the complete variant
                # of this method elsewhere in this file.
                except SystemError as msg:
                    print(msg, file=sys.stderr)
                    preserve = True
                    ctext = ""
            else:
                # We should not get here if no OCR is enabled.  If it
                # is enabled and we have no program, its OK to spew lots
                # of warnings - they should either disable OCR (it is by
                # default), or fix their config.
                print("No OCR program '%s' available - can't get text!"
                      % (self.engine.engine_name,), file=sys.stderr)
                ctext = ""
            ctokens = set()
            if not ctext.strip():
                # Lots of spam now contains images in which it is
                # difficult or impossible (using ocrad) to find any
                # text.  Make a note of that.
                ctokens.add("image-text:no text found")
            else:
                nlines = len(ctext.strip().split("\n"))
                if nlines:
                    ctokens.add("image-text-lines:%d" % int(log2(nlines)))
            self.cache[fhash] = (ctext, ctokens)
    # NOTE(review): textbits/tokens are accumulated but this variant ends
    # without using or returning them, and pnmfiles are never removed -
    # confirm against the complete variant of this method in this file.
def main():
    """Split one or more mbox files randomly into n output directories.

    Options:
        -h / --help  print usage and exit
        -g           treat each input path as a glob pattern
        -s SEED      seed the random number generator
        -n N         number of output directories (required, > 1)
        -v           print a progress dot every 100 messages
        -d           skip duplicate messages (keyed by md5 of their text)
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help'])
    except getopt.error as msg:
        usage(1, msg)
    doglob = False
    n = None
    verbose = False
    delete_dups = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-g':
            doglob = True
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-n':
            n = int(arg)
        elif opt == '-v':
            verbose = True
        elif opt == '-d':
            delete_dups = True
    if n is None or n <= 1:
        usage(1, "an -n value > 1 is required")
    if len(args) < 2:
        usage(1, "input mbox name and output base path are required")
    inputpaths, outputbasepath = args[:-1], args[-1]
    # Output directories are named <base>1 .. <base>n.
    outdirs = [outputbasepath + ("%d" % i) for i in range(1, n + 1)]
    for outdir in outdirs:          # renamed: ``dir`` shadowed the builtin
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]
        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                astext = str(msg)
                # Encode once: hashlib.md5 needs bytes, and the output
                # file is opened in binary mode - the old code passed
                # str to both, a TypeError on Python 3.
                data = astext.encode("utf-8", "replace")
                cksum = md5(data).hexdigest()
                if delete_dups and cksum in cksums:
                    skipped += 1
                    continue
                cksums.add(cksum)
                # Pick a destination directory uniformly at random.
                i = random.randrange(n)
                counter += 1
                # with-statement so the handle closes even on error.
                with open('%s/%d' % (outdirs[i], counter), 'wb') as msgfile:
                    msgfile.write(data)
                if verbose and counter % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
    if verbose:
        print()
        print(counter, "messages split into", n, "directories")
    if skipped:
        print("skipped", skipped, "duplicate messages")
# NOTE(review): this fragment looks like the tail of a directory-setup
# loop plus the message-distribution loop; dir, inputpaths, doglob,
# outdirs, n, delete_dups and verbose are bound outside this span.
os.makedirs(dir)
counter = 0   # messages written so far; also used as the output filename
cksums = set()   # md5 digests of messages already copied (for dup skip)
skipped = 0   # count of duplicate messages skipped
for inputpath in inputpaths:
    # doglob means treat each input path as a glob pattern.
    if doglob:
        inpaths = glob.glob(inputpath)
    else:
        inpaths = [inputpath]
    for inpath in inpaths:
        mbox = mboxutils.getmbox(inpath)
        for msg in mbox:
            astext = str(msg)
            cksum = md5(astext).hexdigest()
            # Skip messages whose full text has been seen before.
            if delete_dups and cksum in cksums:
                skipped += 1
                continue
            cksums.add(cksum)
            # Pick a destination directory uniformly at random.
            i = random.randrange(n)
            #assert astext.endswith('\n')
            counter += 1
            msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
            msgfile.write(astext)
            msgfile.close()
            # Progress dot every 100 messages in verbose mode.
            if verbose:
                if counter % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()