def unflow(self, convert_lf=False):
    """Unflow text of type format=flowed.

    By default, lines ending in LF (mbox imports) are not converted to CRLF,
    and thus not unflowed. This is to be consistent with previous versions of
    Pony Mail, and can be enabled for any new installations that are not
    reimaging their database.

    :param convert_lf: when true, line endings are normalised to CRLF before
        flow decoding and turned back into LF afterwards.
    :return: ``self.string`` unchanged when it is empty or ``self.flowed`` is
        not set; otherwise the result of ``formatflowed.convertToWrapped``.
    """
    if not self.string or not self.flowed:
        return self.string

    # Use provider character set or fall back to our sane default.
    character_set = self.character_set or DEFAULT_CHARACTER_SET

    # Convert lone LF to CRLF if requested, remembering whether that
    # actually changed anything.
    conversion_was_needed = False
    if convert_lf:
        fixed_string = "\r\n".join(
            line.rstrip("\r") for line in self.string.split("\n"))
        conversion_was_needed = fixed_string != self.string
    else:
        fixed_string = self.string

    flow_fixed = formatflowed.convertToWrapped(
        fixed_string.encode(character_set, errors="ignore"),
        wrap_fixed=False,
        character_set=character_set,
    )

    # If we "upconverted" from LF to CRLF, convert back after flow decoding.
    # This must operate on the decoded text: re-splitting self.string here
    # would throw the unflowed result away.
    # NOTE(review): assumes convertToWrapped returns str -- confirm.
    if conversion_was_needed:
        flow_fixed = "\n".join(
            line.rstrip("\r") for line in flow_fixed.split("\n"))
    return flow_fixed
def compute_updates(self, lid, private, msg):
    """Determine what needs to be sent to the archiver.

    :param lid: The list id. When empty, the message's ``list-id`` header
        is used instead.
    :param private: Copied verbatim into the ``private`` field of the
        generated document.
    :param msg: The message object (must support ``.get(header)``).
    :return: A two-tuple ``(ojson, contents)``. ``ojson`` is the digested
        email as a dict, or None when the message has neither a body nor
        attachments. ``contents`` is the second value returned by
        ``self.msgfiles(msg)``.

    Side effects: ``self.msg_metadata`` and ``self.irt`` are set to the
    decoded header fields and the in-reply-to value.
    """
    ojson = None
    if not lid:
        lid = msg.get('list-id')
    # Guard on lid: the header may be absent, and re.sub/replace on None
    # would raise.
    if self.cropout and lid:
        crops = self.cropout.split(" ")
        if len(crops) == 2:
            # Regex replace
            lid = re.sub(crops[0], crops[1], lid)
        else:
            # Standard crop out
            lid = lid.replace(self.cropout, "")

    # Collect the headers of interest; missing/empty values become "".
    msg_metadata = {}
    for k in self.keys:
        value = msg.get(k)
        msg_metadata[k] = str(value) if value else ""

    # Fallback id, used when MID generation below fails.
    mid = hashlib.sha224(
        str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')
    ).hexdigest() + "@" + (lid if lid else "none")

    # Decode RFC 2047 encoded-words in the display headers.
    for key in ['to', 'from', 'subject', 'message-id']:
        try:
            hval = ""
            if msg_metadata.get(key):
                for text, charset in email.header.decode_header(msg_metadata[key]):
                    if charset is None or charset.find("8bit") != -1:
                        hval += text.decode('utf-8') if isinstance(text, bytes) else text
                    else:
                        hval += text.decode(charset, errors='ignore')
                msg_metadata[key] = hval
        except Exception as err:
            print("Could not decode headers, ignoring..: %s" % err)
    if not msg_metadata.get('message-id'):
        msg_metadata['message-id'] = mid

    mdate = None
    uid_mdate = 0  # mdate for UID generation
    try:
        mdate = email.utils.parsedate_tz(msg_metadata.get('date'))
        uid_mdate = email.utils.mktime_tz(mdate)  # Only set if Date header is valid
    except Exception:
        pass
    if not mdate and msg_metadata.get('archived-at'):
        mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at'))
    if not mdate:
        # Reached when Date is unusable and Archived-At is missing or also
        # unparsable (parsedate_tz returns None in that case).
        print("Date (%s) seems totally wrong, setting to _now_ instead."
              % msg_metadata.get('date'))
        # Standard 9-tuple plus a fake TZ (10th element) for mktime_tz.
        mdate = time.gmtime() + (0, )
    mdatestring = time.strftime("%Y/%m/%d %H:%M:%S",
                                time.gmtime(email.utils.mktime_tz(mdate)))

    body = self.msgbody(msg)
    try:
        if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
            body = convertToWrapped(body, character_set="utf-8")
        if isinstance(body, str):
            body = body.encode('utf-8')
    except Exception:
        # Best-effort recovery chain: detected charset, then latin-1, then
        # plain UTF-8 encoding; give up with None if all of those fail.
        try:
            body = body.decode(chardet.detect(body)['encoding'])
        except Exception:
            try:
                body = body.decode('latin-1')
            except Exception:
                try:
                    if isinstance(body, str):
                        body = body.encode('utf-8')
                except Exception:
                    body = None

    attachments, contents = self.msgfiles(msg)
    irt = ""
    if body is not None or attachments:
        pmid = mid
        try:
            if archiver_generator == "full":
                # Use full message as bytes for mid
                mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
            elif archiver_generator == "medium":
                xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
                xbody += bytes(lid, encoding='ascii')
                xbody += bytes(mdatestring, encoding='ascii')
                mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
            else:
                # Or revert to the old way
                xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
                mid = "%s@%s@%s" % (hashlib.sha224(xbody).hexdigest(), uid_mdate, lid)
        except Exception as err:
            if logger:
                logger.warning("Could not generate MID: %s", err)
            mid = pmid

        if 'in-reply-to' in msg_metadata:
            try:
                try:
                    irt = "".join(msg_metadata['in-reply-to'])
                except Exception:
                    irt = msg_metadata.get('in-reply-to').__str__()
            except Exception:
                irt = ""

        ojson = {
            'from_raw': msg_metadata['from'],
            'from': msg_metadata['from'],
            'to': msg_metadata['to'],
            'subject': msg_metadata['subject'],
            'message-id': msg_metadata['message-id'],
            'mid': mid,
            'cc': msg_metadata.get('cc'),
            'epoch': email.utils.mktime_tz(mdate),
            'list': lid,
            'list_raw': lid,
            'date': mdatestring,
            'private': private,
            'references': msg_metadata['references'],
            'in-reply-to': irt,
            'body': body.decode('utf-8', 'replace') if isinstance(body, bytes) else body,
            'attachments': attachments,
        }

    self.msg_metadata = msg_metadata
    self.irt = irt
    return ojson, contents
def compute_updates(self, lid, private, msg):
    """Determine what needs to be sent to the archiver.

    :param lid: The list id. When empty, it is derived from the message's
        ``list-id`` header via ``normalize_lid``.
    :param private: Copied verbatim into the ``private`` field of the
        generated document.
    :param msg: The message object (must support ``.get(header)``).
    :return: A two-tuple ``(ojson, contents)``. ``ojson`` is the digested
        email as a dict, or None when the message has neither a body nor
        attachments. ``contents`` is the second value returned by
        ``self.msgfiles(msg)``.

    Side effects: ``self.msg_metadata`` and ``self.irt`` are set to the
    decoded header fields and the in-reply-to value.
    """
    ojson = None
    if not lid:
        lid = normalize_lid(msg.get('list-id'))
    # Guard on lid so an empty list id is never fed to re.sub/replace.
    if self.cropout and lid:
        crops = self.cropout.split(" ")
        if len(crops) == 2:
            # Regex replace
            lid = re.sub(crops[0], crops[1], lid)
        else:
            # Standard crop out
            lid = lid.replace(self.cropout, "")

    # Collect the headers of interest; missing/empty values become "".
    msg_metadata = {}
    for k in self.keys:
        value = msg.get(k)
        msg_metadata[k] = str(value) if value else ""

    # Fallback id, used when the configured generator fails.
    mid = hashlib.sha224(
        str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')
    ).hexdigest() + "@" + (lid if lid else "none")

    # Decode RFC 2047 encoded-words in the display headers.
    for key in ['to', 'from', 'subject', 'message-id']:
        try:
            hval = ""
            if msg_metadata.get(key):
                for text, charset in email.header.decode_header(msg_metadata[key]):
                    if charset is None or charset.find("8bit") != -1:
                        hval += text.decode('utf-8') if isinstance(text, bytes) else text
                    else:
                        hval += text.decode(charset, errors='ignore')
                msg_metadata[key] = hval
        except Exception as err:
            print("Could not decode headers, ignoring..: %s" % err)

    mdate = None
    try:
        mdate = email.utils.parsedate_tz(msg_metadata.get('date'))
    except Exception:
        pass
    if not mdate and msg_metadata.get('archived-at'):
        mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at'))
    if not mdate:
        # Reached when Date is unusable and Archived-At is missing or also
        # unparsable (parsedate_tz returns None in that case).
        print("Date (%s) seems totally wrong, setting to _now_ instead."
              % msg_metadata.get('date'))
        # Standard 9-tuple plus a fake TZ (10th element) for mktime_tz.
        mdate = time.gmtime() + (0, )

    # mdate calculations are all done, prepare the index entry
    epoch = email.utils.mktime_tz(mdate)
    mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))

    body = self.msgbody(msg)
    try:
        if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
            body = convertToWrapped(body, character_set="utf-8")
        if isinstance(body, str):
            body = body.encode('utf-8')
    except Exception:
        # Best-effort recovery chain: detected charset, then latin-1, then
        # plain UTF-8 encoding; give up with None if all of those fail.
        try:
            body = body.decode(chardet.detect(body)['encoding'])
        except Exception:
            try:
                body = body.decode('latin-1')
            except Exception:
                try:
                    if isinstance(body, str):
                        body = body.encode('utf-8')
                except Exception:
                    body = None

    attachments, contents = self.msgfiles(msg)
    irt = ""
    if body is not None or attachments:
        pmid = mid
        try:
            if archiver_generator == "full":
                mid = generators.full(msg, body, lid, attachments)
            elif archiver_generator == "medium":
                mid = generators.medium(msg, body, lid, attachments)
            elif archiver_generator == "cluster":
                mid = generators.cluster(msg, body, lid, attachments)
            else:
                mid = generators.legacy(msg, body, lid, attachments)
        except Exception as err:
            if logger:
                logger.warning("Could not generate MID: %s", err)
            mid = pmid

        if 'in-reply-to' in msg_metadata:
            try:
                try:
                    irt = "".join(msg_metadata['in-reply-to'])
                except Exception:
                    irt = msg_metadata.get('in-reply-to').__str__()
            except Exception:
                irt = ""

        ojson = {
            'from_raw': msg_metadata['from'],
            'from': msg_metadata['from'],
            'to': msg_metadata['to'],
            'subject': msg_metadata['subject'],
            'message-id': msg_metadata['message-id'],
            'mid': mid,
            'cc': msg_metadata.get('cc'),
            'epoch': epoch,
            'list': lid,
            'list_raw': lid,
            'date': mdatestring,
            'private': private,
            'references': msg_metadata['references'],
            'in-reply-to': irt,
            'body': body.decode('utf-8', 'replace') if isinstance(body, bytes) else body,
            'attachments': attachments,
        }

    self.msg_metadata = msg_metadata
    self.irt = irt
    return ojson, contents
def compute_updates(self, lid, private, msg):
    """Determine what needs to be sent to the archiver.

    :param lid: The list id. When empty, it is derived from the message's
        ``list-id`` header via ``normalize_lid``.
    :param private: Whether privately archived email or not (bool)
    :param msg: The message object (must support ``.get(header)``).
    :return: A four-tuple ``(ojson, contents, msg_metadata, irt)``: the
        digested email as a dict (None when the message has neither a body
        nor attachments), its attachments as returned by
        ``self.msgfiles(msg)``, its metadata fields and any in-reply-to
        data found.
    """
    ojson = None
    if not lid:
        lid = normalize_lid(msg.get('list-id'))
    # Guard on lid so an empty list id is never fed to re.sub/replace.
    if self.cropout and lid:
        crops = self.cropout.split(" ")
        if len(crops) == 2:
            # Regex replace
            lid = re.sub(crops[0], crops[1], lid)
        else:
            # Standard crop out
            lid = lid.replace(self.cropout, "")

    # Collect the headers of interest; missing/empty values become "".
    msg_metadata = {}
    for k in self.keys:
        value = msg.get(k)
        msg_metadata[k] = str(value) if value else ""

    # Fallback id, used when the configured generator fails.
    mid = hashlib.sha224(
        str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')
    ).hexdigest() + "@" + (lid if lid else "none")

    # Decode RFC 2047 encoded-words in the display headers.
    for key in ['to', 'from', 'subject', 'message-id']:
        try:
            hval = ""
            if msg_metadata.get(key):
                for text, charset in email.header.decode_header(msg_metadata[key]):
                    if charset is None or charset.find("8bit") != -1:
                        hval += text.decode('utf-8') if isinstance(text, bytes) else text
                    else:
                        hval += text.decode(charset, errors='ignore')
                msg_metadata[key] = hval
        except Exception as err:
            print("Could not decode headers, ignoring..: %s" % err)

    mdate = None
    try:
        mdate = email.utils.parsedate_tz(msg_metadata.get('date'))
    except Exception:
        pass
    if not mdate and msg_metadata.get('archived-at'):
        mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at'))
    if not mdate:
        # Reached when Date is unusable and Archived-At is missing or also
        # unparsable (parsedate_tz returns None in that case).
        print("Date (%s) seems totally wrong, setting to _now_ instead."
              % msg_metadata.get('date'))
        # Standard 9-tuple plus a fake TZ (10th element) for mktime_tz.
        mdate = time.gmtime() + (0, )

    # mdate calculations are all done, prepare the index entry
    epoch = email.utils.mktime_tz(mdate)
    mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))

    # The content-type entry is not modified after this point, so the
    # format=flowed check can be evaluated once.
    is_flowed = ('content-type' in msg_metadata
                 and msg_metadata['content-type'].find("flowed") != -1)

    body = self.msgbody(msg)
    saved_body = None  # for format=flowed
    try:
        if is_flowed:
            saved_body = body  # so we can redo it properly later
            # N.B. the convertToWrapped call usually fails, because body is
            # generally a string here. However sometimes body is bytes at
            # this point, in which case it works.
            body = formatflowed.convertToWrapped(body, character_set="utf-8")
            # DO NOT FIX IT -- otherwise generated MIDs will change.
            # The code now applies the formatting properly later.
        if isinstance(body, str):
            body = body.encode('utf-8')
    except Exception:
        # Best-effort recovery chain: detected charset, then latin-1, then
        # plain UTF-8 encoding; give up with None if all of those fail.
        try:
            body = body.decode(chardet.detect(body)['encoding'])
        except Exception:
            try:
                body = body.decode('latin-1')
            except Exception:
                try:
                    if isinstance(body, str):
                        body = body.encode('utf-8')
                except Exception:
                    body = None

    attachments, contents = self.msgfiles(msg)
    irt = ""
    if body is not None or attachments:
        pmid = mid
        try:
            mid = generators.generate(self.generator, msg, body, lid, attachments)
        except Exception as err:
            if logger:
                # N.B. use .get just in case there is no message-id
                logger.info(
                    "Could not generate MID using %s: %s. MSGID: %s",
                    self.generator, err,
                    msg_metadata.get('message-id', '?').strip())
            mid = pmid

        if 'in-reply-to' in msg_metadata:
            try:
                try:
                    irt = "".join(msg_metadata['in-reply-to'])
                except Exception:
                    irt = msg_metadata.get('in-reply-to').__str__()
            except Exception:
                irt = ""

        if not self.skipff and is_flowed:
            if isinstance(saved_body, str):
                saved_body = saved_body.encode('utf-8', 'replace')
            try:
                # Allow wrapping to be done on the client display by
                # unwrapping to a single long line. The value 2000 should
                # be more than enough for most email paragraphs.
                # formatflowed requires CRLF line endings, but generates LF
                # endings...
                # TEMP: CRLF conversion (to_crlf(saved_body)) is disabled
                # until can work out how to fix tests.
                body = formatflowed.convertToWrapped(
                    saved_body, width=2000, wrap_fixed=False,
                    character_set="utf-8")
            except Exception:
                pass  # Don't try to recover

        ojson = {
            'from_raw': msg_metadata['from'],
            'from': msg_metadata['from'],
            'to': msg_metadata['to'],
            'subject': msg_metadata['subject'],
            'message-id': msg_metadata['message-id'],
            'mid': mid,
            'cc': msg_metadata.get('cc'),
            'epoch': epoch,
            'list': lid,
            'list_raw': lid,
            'date': mdatestring,
            'private': private,
            'references': msg_metadata['references'],
            'in-reply-to': irt,
            'body': body.decode('utf-8', 'replace') if isinstance(body, bytes) else body,
            'attachments': attachments,
        }

    return ojson, contents, msg_metadata, irt
def run(self):
    """Worker loop: import mailboxes taken from the shared ``lists`` queue.

    Each queue entry is either a local file (``filebased``; gzip files are
    decompressed to a temporary file first) or a remote mbox that is
    downloaded from ``source``. Every message is either re-sent through the
    local MTA (``resendTo``) or digested into an index document plus a
    source document, which are handed to ``BulkThread`` in batches of 40.

    The module-level counters ``y`` (work items and parsed records) and
    ``baddies`` (messages without a Subject header) are updated.
    """
    global block, y, es, lists, baddies, config, resendTo, timeout
    ja = []   # pending 'mbox' documents
    jas = []  # pending 'mbox_source' documents
    print("Thread started")
    mla = None
    ml = ""
    mboxfile = ""
    filename = ""
    xlist_override = None

    while len(lists) > 0:
        print("%u elements left to slurp" % len(lists))
        # Take the next work item under the shared lock; the finally clause
        # guarantees the lock is released on every exit path.
        block.acquire()
        try:
            mla = lists.pop(0)
        except Exception as err:
            print("Could not pop list: %s" % err)
            return
        finally:
            block.release()
        if not mla:
            print("Nothing more to do here")
            return
        y += 1
        EY = 1980
        EM = 1
        stime = time.time()
        dFile = False  # True when tmpname is a temp file we must delete
        if filebased:
            tmpname = mla[0]
            filename = mla[0]
            xlist_override = mla[1]
            if filename.find(".gz") != -1:
                print("Decompressing %s..." % filename)
                try:
                    with open(filename, "rb") as bf:
                        bmd = gzip.decompress(bf.read())
                    with tempfile.NamedTemporaryFile(mode='w+b', delete=False) as tmpfile:
                        tmpfile.write(bmd)
                    tmpname = tmpfile.name
                    filename = tmpname
                    dFile = True  # Slated for deletion upon having been read
                    print("%s -> %u bytes" % (tmpname, len(bmd)))
                except Exception as err:
                    print("This wasn't a gzip file: %s" % err)
            print("Slurping %s" % filename)
        else:
            ml = mla[0]
            mboxfile = mla[1]
            xlist_override = list_override
            print("Slurping %s/%s" % (ml, mboxfile))
            # A YYYYMM prefix in the mbox name seeds the fallback date.
            m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
            EY = 1997
            EM = 1
            if m:
                EY = int(m.group(1))
                EM = int(m.group(2))
            with urlopen("%s%s/%s" % (source, ml, mboxfile)) as ctx:
                inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')
            tmpname = hashlib.sha224(
                ("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8')
            ).hexdigest()
            with open(tmpname, "w") as f:
                f.write(inp)

        count = 0
        LEY = EY  # last plausible year seen so far
        if maildir:
            messages = mailbox.Maildir(tmpname)
        else:
            messages = mailbox.mbox(tmpname)

        for message in messages:
            if resendTo:
                # Parenthesised so the '??' fallback replaces only the id,
                # not the whole line.
                print("Delivering message %s via MTA"
                      % (message['message-id'] if 'message-id' in message else '??'))
                try:
                    if list_override:
                        message.replace_header('List-ID', list_override)
                    message.replace_header('To', resendTo)
                except Exception:
                    if list_override:
                        message['List-ID'] = list_override
                message['cc'] = None
                # Context manager closes the connection after each delivery.
                with SMTP('localhost') as s:
                    s.send_message(message, from_addr=None, to_addrs=(resendTo))
                continue
            if time.time() - stime > timeout:
                # break out after N seconds, it shouldn't take this long..!
                print("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                break

            if 'subject' not in message:
                baddies += 1
                continue

            mid = message['message-id']
            lid = message['list-id']
            if args.requirelid and not lid:
                continue
            if not lid:
                # Guess list name in absence
                lid = '.'.join(reversed(ml.split("-"))) + "." + appender
            # Compact LID to <foo@domain>, discard rest
            m = re.search(r"(<.+>)", lid)
            if m:
                lid = m.group(1)
            if xlist_override and len(xlist_override) > 3:
                lid = xlist_override
            lid = lid.replace("@", ".")      # we want foo.bar.org, not foo@bar.org
            lid = "<%s>" % lid.strip("<>")   # We need <> around it!
            if cropout:
                crops = cropout.split(" ")
                if len(crops) == 2:
                    # Regex replace
                    lid = re.sub(crops[0], crops[1], lid)
                else:
                    # Standard crop out
                    lid = lid.replace(cropout, "")

            body = msgbody(message)
            try:
                if 'content-type' in message and message['content-type'].find("flowed") != -1:
                    body = convertToWrapped(body, character_set="utf-8")
                if isinstance(body, str):
                    body = body.encode('utf-8')
            except Exception:
                # Best-effort recovery chain: detected charset, latin-1,
                # plain UTF-8 encoding, and finally giving up with None.
                try:
                    body = body.decode(chardet.detect(body)['encoding'])
                except Exception:
                    try:
                        body = body.decode('latin-1')
                    except Exception:
                        try:
                            if isinstance(body, str):
                                body = body.encode('utf-8')
                        except Exception:
                            body = None

            okay = True
            dheader = {}
            for key in ['to', 'from', 'subject', 'message-id']:
                try:
                    hval = ""
                    if message.get(key):
                        for text, charset in email.header.decode_header(message[key]):
                            if charset is None or charset.find("8bit") != -1:
                                hval += text.decode('utf-8', errors='replace') if isinstance(text, bytes) else text
                            else:
                                hval += text.decode(charset, errors='ignore')
                        dheader[key] = hval
                    else:
                        dheader[key] = "(Unknown)"
                except Exception as err:
                    print("Could not decode headers, ignoring..: %s" % err)
                    okay = False

            mdt = ""
            if 'date' not in message and 'received' in message:
                print("No Date header found, resorting to Received")
                m = re.search(r"(\d+ \S+ \d{4} \d\d:\d\d:\d\d ([-+]\d{4})?)", message['received'])
                if m:
                    mdt = m.group(1)
            else:
                mdt = message['date']
            mdate = None
            uid_mdate = 0
            try:
                mdate = email.utils.parsedate_tz(mdt)
                uid_mdate = email.utils.mktime_tz(mdate)
            except Exception:
                pass
            if not mdate or mdate[0] < (LEY - 1):
                print("Date is wrong or missing here, setting to %s" % (LEY))
                # mktime_tz() needs a 10-tuple; timetuple() only has nine
                # elements, so append a fake zero TZ offset. Without it the
                # fallback date raised and the message was dropped.
                mdate = tuple(datetime.datetime(LEY, EM, 1).timetuple()) + (0, )
            else:
                LEY = mdate[0]  # Gather evidence 'n'stuff!
            mdatestring = ""
            epoch = None
            try:
                epoch = email.utils.mktime_tz(mdate)
                mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(epoch))
            except Exception:
                okay = False

            if not (body and okay and mdate and {'from', 'subject'} <= set(dheader)):
                continue

            # Pipermail transforms from: to something weird - reset that!
            if piperWeirdness:
                m = re.match(r"(.+) at ([^(]+) \((.+)\)$", dheader['from'])
                if m:
                    dheader['from'] = "%s <%s@%s>" % (m.group(3), m.group(1), m.group(2))
                else:
                    # Try just 'foo at bar.tld'. That pattern has no
                    # display-name group, so only the address is rebuilt.
                    m = re.match(r"(.+) at ([^(]+)$", dheader['from'])
                    if m:
                        dheader['from'] = "%s@%s" % (m.group(1), m.group(2))

            attachments, contents = msgfiles(message)
            if not mid:
                try:
                    mid = hashlib.sha256(
                        body if isinstance(body, bytes) else body.encode('ascii', errors='ignore')
                    ).hexdigest() + "@" + lid + "@" + appender
                except Exception:
                    if filebased:
                        seed = "%f-%f-%s" % (random.random(), time.time(), filename)
                    else:
                        seed = "%f-%f-%s-%s" % (random.random(), time.time(), ml, mboxfile)
                    # hashlib requires bytes, not str.
                    mid = hashlib.sha256(seed.encode('utf-8')).hexdigest() + "@" + appender
                print("No MID found, setting to %s" % mid)
            mid2 = "%s@%s@%s" % (
                hashlib.sha224(body if isinstance(body, bytes) else body.encode('ascii', 'ignore')).hexdigest(),
                uid_mdate, lid)
            count += 1
            mr = ""
            if 'references' in message:
                mr = message['references']
            irt = ""
            if 'in-reply-to' in message:
                try:
                    irt = "".join(message['in-reply-to'])
                except Exception:
                    irt = message.get('in-reply-to').__str__()

            doc = {
                'from_raw': dheader['from'],
                'from': dheader['from'],
                'to': dheader['to'],
                'subject': dheader['subject'],
                'cc': message.get('cc'),
                'message-id': mid,
                'mid': mid2,
                'epoch': epoch,
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': mr,
                'in-reply-to': irt,
                'body': body.decode('utf-8', errors='replace') if isinstance(body, bytes) else body,
                'attachments': attachments,
            }
            doc_source = {
                'mid': mid2,
                'message-id': mid,
                'source': message.as_bytes().decode('utf-8', errors='replace'),
            }
            ja.append(doc)
            jas.append(doc_source)

            if contents:
                iname = config.get("elasticsearch", "dbname")
                if not args.dry:
                    for key in contents:
                        es.index(
                            index=iname,
                            doc_type="attachment",
                            id=key,
                            body={'source': contents[key]},
                        )
            # Flush both queues once 40 documents have accumulated.
            if len(ja) >= 40:
                if not args.dry:
                    bulk = BulkThread()
                    bulk.assign(ja, es, 'mbox')
                    bulk.insert()
                ja = []
                if not args.dry:
                    bulks = BulkThread()
                    bulks.assign(jas, es, 'mbox_source')
                    bulks.insert()
                jas = []

        # Release the mailbox's file handle before any temp file is removed.
        messages.close()
        if filebased:
            print("Parsed %u records from %s" % (count, filename))
            if dFile:
                os.unlink(tmpname)
        else:
            print("Parsed %s/%s: %u records from %s" % (ml, mboxfile, count, tmpname))
            os.unlink(tmpname)

        y += count
        # Flush whatever is left for this mailbox.
        if not args.dry:
            bulk = BulkThread()
            bulk.assign(ja, es)
            bulk.insert()
        ja = []
        if not args.dry:
            bulks = BulkThread()
            bulks.assign(jas, es, 'mbox_source')
            bulks.insert()
        jas = []
    print("Done, %u elements left to slurp" % len(lists))
def run(self):
    """Worker loop: import mailboxes taken from the shared ``lists`` queue.

    Each queue entry is either a local mbox file (``filebased``) or a
    remote mbox that is downloaded from ``source``. Every message is either
    re-sent through the local MTA (``resendTo``) or digested into an index
    document plus a source document, which are handed to ``BulkThread`` in
    batches of 40.

    The module-level counters ``y`` (work items and parsed records) and
    ``baddies`` (messages without a Subject header) are updated.
    """
    global block, y, es, lists, baddies, config, resendTo
    ja = []   # pending 'mbox' documents
    jas = []  # pending 'mbox_source' documents
    print("Thread started")
    mla = None
    ml = ""
    mboxfile = ""
    filename = ""
    xlist_override = None

    while len(lists) > 0:
        print("%u elements left to slurp" % len(lists))
        # Take the next work item under the shared lock; the finally clause
        # guarantees the lock is released on every exit path.
        block.acquire()
        try:
            mla = lists.pop(0)
        except Exception as err:
            print("Could not pop list: %s" % err)
            return
        finally:
            block.release()
        if not mla:
            print("Nothing more to do here")
            return
        y += 1
        EY = 1980
        EM = 1
        stime = time.time()
        if filebased:
            tmpname = mla[0]
            filename = mla[0]
            xlist_override = mla[1]
            print("Slurping %s" % filename)
        else:
            ml = mla[0]
            mboxfile = mla[1]
            xlist_override = list_override
            print("Slurping %s/%s" % (ml, mboxfile))
            # A YYYYMM prefix in the mbox name seeds the fallback date.
            m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
            EY = 1997
            EM = 1
            if m:
                EY = int(m.group(1))
                EM = int(m.group(2))
            with urlopen("%s%s/%s" % (source, ml, mboxfile)) as ctx:
                inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')
            tmpname = hashlib.sha224(
                ("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8')
            ).hexdigest()
            with open(tmpname, "w") as f:
                f.write(inp)

        count = 0
        LEY = EY  # last plausible year seen so far
        messages = mailbox.mbox(tmpname)
        for message in messages:
            if resendTo:
                # Parenthesised so the '??' fallback replaces only the id,
                # not the whole line.
                print("Delivering message %s via MTA"
                      % (message['message-id'] if 'message-id' in message else '??'))
                try:
                    if list_override:
                        message.replace_header('List-ID', list_override)
                    message.replace_header('To', resendTo)
                except Exception:
                    if list_override:
                        message['List-ID'] = list_override
                message['cc'] = None
                # Context manager closes the connection after each delivery.
                with SMTP('localhost') as s:
                    s.send_message(message, from_addr=None, to_addrs=(resendTo))
                continue
            if time.time() - stime > 360:
                # break out after 6 minutes, it shouldn't take this long..!
                print("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                break

            if 'subject' not in message:
                baddies += 1
                continue

            mid = message['message-id']
            lid = message['list-id']
            if not lid:
                # Guess list name in absence
                lid = '.'.join(reversed(ml.split("-"))) + "." + appender
            # Compact LID to <foo@domain>, discard rest
            m = re.search(r"(<.+>)", lid)
            if m:
                lid = m.group(1)
            if xlist_override and len(xlist_override) > 3:
                lid = xlist_override
            lid = lid.replace("@", ".")      # we want foo.bar.org, not foo@bar.org
            lid = "<%s>" % lid.strip("<>")   # We need <> around it!
            if cropout:
                crops = cropout.split(" ")
                if len(crops) == 2:
                    # Regex replace
                    lid = re.sub(crops[0], crops[1], lid)
                else:
                    # Standard crop out
                    lid = lid.replace(cropout, "")

            body = msgbody(message)
            try:
                if 'content-type' in message and message['content-type'].find("flowed") != -1:
                    body = convertToWrapped(body, character_set="utf-8")
                if isinstance(body, str):
                    body = body.encode('utf-8')
            except Exception:
                # Best-effort recovery chain: detected charset, latin-1,
                # plain UTF-8 encoding, and finally giving up with None.
                try:
                    body = body.decode(chardet.detect(body)['encoding'])
                except Exception:
                    try:
                        body = body.decode('latin-1')
                    except Exception:
                        try:
                            if isinstance(body, str):
                                body = body.encode('utf-8')
                        except Exception:
                            body = None

            okay = True
            dheader = {}
            for key in ['to', 'from', 'subject', 'message-id']:
                try:
                    hval = ""
                    if message.get(key):
                        for text, charset in email.header.decode_header(message[key]):
                            if charset is None or charset.find("8bit") != -1:
                                hval += text.decode('utf-8', errors='replace') if isinstance(text, bytes) else text
                            else:
                                hval += text.decode(charset, errors='ignore')
                        dheader[key] = hval
                    else:
                        dheader[key] = "(Unknown)"
                except Exception as err:
                    print("Could not decode headers, ignoring..: %s" % err)
                    okay = False

            mdt = ""
            if 'date' not in message and 'received' in message:
                print("No Date header found, resorting to Received")
                m = re.search(r"(\d+ \S+ \d{4} \d\d:\d\d:\d\d ([-+]\d{4})?)", message['received'])
                if m:
                    mdt = m.group(1)
            else:
                mdt = message['date']
            mdate = None
            uid_mdate = 0
            try:
                mdate = email.utils.parsedate_tz(mdt)
                uid_mdate = email.utils.mktime_tz(mdate)
            except Exception:
                pass
            if not mdate or mdate[0] < (LEY - 1):
                print("Date is wrong or missing here, setting to %s" % (LEY))
                # mktime_tz() needs a 10-tuple; timetuple() only has nine
                # elements, so append a fake zero TZ offset. Without it the
                # fallback date raised and the message was dropped.
                mdate = tuple(datetime.datetime(LEY, EM, 1).timetuple()) + (0, )
            else:
                LEY = mdate[0]  # Gather evidence 'n'stuff!
            mdatestring = ""
            epoch = None
            try:
                epoch = email.utils.mktime_tz(mdate)
                mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(epoch))
            except Exception:
                okay = False

            if not (body and okay and mdate and {'from', 'subject'} <= set(dheader)):
                continue

            # Pipermail transforms from: to something weird - reset that!
            if piperWeirdness:
                m = re.match(r"(.+) at ([^(]+) \((.+)\)$", dheader['from'])
                if m:
                    dheader['from'] = "%s <%s@%s>" % (m.group(3), m.group(1), m.group(2))
                else:
                    # Try just 'foo at bar.tld'. That pattern has no
                    # display-name group, so only the address is rebuilt.
                    m = re.match(r"(.+) at ([^(]+)$", dheader['from'])
                    if m:
                        dheader['from'] = "%s@%s" % (m.group(1), m.group(2))

            attachments, contents = msgfiles(message)
            if not mid:
                try:
                    mid = hashlib.sha256(
                        body if isinstance(body, bytes) else body.encode('ascii', errors='ignore')
                    ).hexdigest() + "@" + lid + "@" + appender
                except Exception:
                    if filebased:
                        seed = "%f-%f-%s" % (random.random(), time.time(), filename)
                    else:
                        seed = "%f-%f-%s-%s" % (random.random(), time.time(), ml, mboxfile)
                    # hashlib requires bytes, not str.
                    mid = hashlib.sha256(seed.encode('utf-8')).hexdigest() + "@" + appender
                print("No MID found, setting to %s" % mid)
            mid2 = "%s@%s@%s" % (
                hashlib.sha224(body if isinstance(body, bytes) else body.encode('ascii', 'ignore')).hexdigest(),
                uid_mdate, lid)
            count += 1
            mr = ""
            if 'references' in message:
                mr = message['references']
            irt = ""
            if 'in-reply-to' in message:
                try:
                    irt = "".join(message['in-reply-to'])
                except Exception:
                    irt = message.get('in-reply-to').__str__()

            doc = {
                'from_raw': dheader['from'],
                'from': dheader['from'],
                'to': dheader['to'],
                'subject': dheader['subject'],
                'cc': message.get('cc'),
                'message-id': mid,
                'mid': mid2,
                'epoch': epoch,
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': mr,
                'in-reply-to': irt,
                'body': body.decode('utf-8', errors='replace') if isinstance(body, bytes) else body,
                'attachments': attachments,
            }
            doc_source = {
                'mid': mid2,
                'message-id': mid,
                'source': message.as_bytes().decode('utf-8', errors='replace'),
            }
            ja.append(doc)
            jas.append(doc_source)

            if contents:
                iname = config.get("elasticsearch", "dbname")
                if not args.dry:
                    for key in contents:
                        es.index(index=iname,
                                 doc_type="attachment",
                                 id=key,
                                 body={'source': contents[key]})
            # Flush both queues once 40 documents have accumulated.
            if len(ja) >= 40:
                if not args.dry:
                    bulk = BulkThread()
                    bulk.assign(ja, es, 'mbox')
                    bulk.insert()
                ja = []
                if not args.dry:
                    bulks = BulkThread()
                    bulks.assign(jas, es, 'mbox_source')
                    bulks.insert()
                jas = []

        # Release the mailbox's file handle before any temp file is removed.
        messages.close()
        if filebased:
            print("Parsed %u records from %s" % (count, filename))
        else:
            print("Parsed %s/%s: %u records from %s" % (ml, mboxfile, count, tmpname))
            os.unlink(tmpname)

        y += count
        # Flush whatever is left for this mailbox.
        if not args.dry:
            bulk = BulkThread()
            bulk.assign(ja, es)
            bulk.insert()
        ja = []
        if not args.dry:
            bulks = BulkThread()
            bulks.assign(jas, es, 'mbox_source')
            bulks.insert()
        jas = []
    print("Done, %u elements left to slurp" % len(lists))