示例#1
0
 def unflow(self, convert_lf=False):
     """Unflow text of type format=flowed.

     By default, lines ending in a lone LF (mbox imports) are not converted
     to CRLF, and thus not unflowed. This is to be consistent with previous
     versions of Pony Mail, and can be enabled for any new installations
     that are not reimaging their database.

     :param convert_lf: When true, lone LF line endings are converted to
                        CRLF before flow decoding and converted back to LF
                        afterwards.
     :return: The unflowed text if ``self.string`` is non-empty and
              ``self.flowed`` is set; otherwise ``self.string`` unchanged.
     """
     # Nothing to do for empty or non-flowed bodies.
     if not self.string or not self.flowed:
         return self.string

     # Use provider character set or fall back to our sane default.
     character_set = self.character_set or DEFAULT_CHARACTER_SET

     # Convert lone LF to CRLF if requested, remembering whether that
     # actually changed anything.
     fixed_string = self.string
     conversion_was_needed = False
     if convert_lf:
         fixed_string = "\r\n".join(
             x.rstrip("\r") for x in self.string.split("\n"))
         conversion_was_needed = fixed_string != self.string

     flow_fixed = formatflowed.convertToWrapped(
         fixed_string.encode(character_set, errors="ignore"),
         wrap_fixed=False,
         character_set=character_set,
     )

     # If we "upconverted" from LF to CRLF, convert back after flow decoding.
     # This must operate on the decoded text; rebuilding from self.string
     # would throw the unflowed result away.
     # NOTE(review): assumes convertToWrapped returns str - confirm.
     if conversion_was_needed:
         flow_fixed = "\n".join(
             x.rstrip("\r") for x in flow_fixed.split("\n"))
     return flow_fixed
示例#2
0
    def compute_updates(self, lid, private, msg):
        """Determine what needs to be sent to the archiver.

        :param lid: The list id. When empty, the message's List-Id header is used.
        :param private: Privacy flag, stored unchanged in the digested email.
        :param msg: The message object.

        :return A two-tuple ``(ojson, contents)``. ``ojson`` is the digested
                email as a dict, or None if the message has neither a usable
                body nor attachments. ``contents`` is the second value
                returned by ``self.msgfiles(msg)``.

        As a side effect, ``self.msg_metadata`` and ``self.irt`` are set.
        """

        ojson = None
        if not lid:
            lid = msg.get('list-id')
        # Only crop when there is a list id: a message without a List-Id
        # header would otherwise crash on None here.
        if self.cropout and lid:
            crops = self.cropout.split(" ")
            if len(crops) == 2:
                # Regex replace
                lid = re.sub(crops[0], crops[1], lid)
            else:
                # Standard crop out
                lid = lid.replace(self.cropout, "")

        # Every configured header is present in the metadata, as a string
        # (empty when the header is missing).
        msg_metadata = {}
        for k in self.keys:
            value = msg.get(k)
            msg_metadata[k] = str(value) if value else ""

        # Provisional id; replaced below if a generator-based id can be built.
        seed = "%s-%s" % (lid, msg_metadata['archived-at'])
        mid = hashlib.sha224(seed.encode('utf-8')).hexdigest() + "@" + (lid if lid else "none")

        # Decode RFC 2047 encoded words in the displayed headers.
        for key in ['to', 'from', 'subject', 'message-id']:
            try:
                hval = ""
                if msg_metadata.get(key):
                    for chunk, charset in email.header.decode_header(msg_metadata[key]):
                        if charset is None or "8bit" in charset:
                            hval += chunk.decode('utf-8') if isinstance(chunk, bytes) else chunk
                        else:
                            hval += chunk.decode(charset, errors='ignore')
                    msg_metadata[key] = hval
            except Exception as err:
                print("Could not decode headers, ignoring..: %s" % err)
        if not msg_metadata.get('message-id'):
            msg_metadata['message-id'] = mid

        # Pick the first header that yields a usable timestamp: Date, then
        # Archived-At. A header only counts if it both parses and converts
        # to an epoch, so an unparseable Archived-At no longer crashes the
        # conversion below.
        mdate = None
        epoch = None
        uid_mdate = 0  # mdate for UID generation; only set if Date header is valid
        for source in ('date', 'archived-at'):
            try:
                candidate = email.utils.parsedate_tz(msg_metadata.get(source))
                epoch = email.utils.mktime_tz(candidate)
            except Exception:
                continue
            mdate = candidate
            if source == 'date':
                uid_mdate = epoch
            break
        if mdate is None:
            print("Date (%s) seems totally wrong, setting to _now_ instead." % msg_metadata.get('date'))
            # Standard 9-tuple plus a fake TZ (10th element)
            mdate = time.gmtime() + (0, )
            epoch = email.utils.mktime_tz(mdate)
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))

        # The body bytes feed the id hashes below, so this conversion chain
        # is kept exactly as it was: changing it would change generated ids.
        body = self.msgbody(msg)
        try:
            if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
                body = convertToWrapped(body, character_set="utf-8")
            if isinstance(body, str):
                body = body.encode('utf-8')
        except Exception:
            try:
                body = body.decode(chardet.detect(body)['encoding'])
            except Exception:
                try:
                    body = body.decode('latin-1')
                except Exception:
                    try:
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except Exception:
                        body = None

        attachments, contents = self.msgfiles(msg)
        irt = ""
        if body is not None or attachments:
            pmid = mid
            try:
                if archiver_generator == "full":
                    # Use the full message as bytes
                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
                elif archiver_generator == "medium":
                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
                    xbody += bytes(lid, encoding='ascii')
                    xbody += bytes(mdatestring, encoding='ascii')
                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
                else:
                    # The old way: body hash, Date epoch and list id
                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
                    mid = "%s@%s@%s" % (hashlib.sha224(xbody).hexdigest(), uid_mdate, lid)
            except Exception as err:
                if logger:
                    logger.warning("Could not generate MID: %s", err)
                # Fall back to the provisional id
                mid = pmid
            if 'in-reply-to' in msg_metadata:
                try:
                    try:
                        irt = "".join(msg_metadata['in-reply-to'])
                    except Exception:
                        irt = msg_metadata.get('in-reply-to').__str__()
                except Exception:
                    irt = ""
            ojson = {
                'from_raw': msg_metadata['from'],
                'from': msg_metadata['from'],
                'to': msg_metadata['to'],
                'subject': msg_metadata['subject'],
                'message-id': msg_metadata['message-id'],
                'mid': mid,
                'cc': msg_metadata.get('cc'),
                'epoch': epoch,
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': msg_metadata['references'],
                'in-reply-to': irt,
                'body': body.decode('utf-8', 'replace') if type(body) is bytes else body,
                'attachments': attachments
            }

        self.msg_metadata = msg_metadata
        self.irt = irt

        return ojson, contents
示例#3
0
    def compute_updates(self, lid, private, msg):
        """Determine what needs to be sent to the archiver.

        :param lid: The list id. When empty, the normalized List-Id header of
                    the message is used.
        :param private: Privacy flag, stored unchanged in the digested email.
        :param msg: The message object.

        :return A two-tuple ``(ojson, contents)``. ``ojson`` is the digested
                email as a dict, or None if the message has neither a usable
                body nor attachments. ``contents`` is the second value
                returned by ``self.msgfiles(msg)``.

        As a side effect, ``self.msg_metadata`` and ``self.irt`` are set.
        """

        ojson = None
        if not lid:
            lid = normalize_lid(msg.get('list-id'))
        # Only crop when there is a list id to crop.
        if self.cropout and lid:
            crops = self.cropout.split(" ")
            if len(crops) == 2:
                # Regex replace
                lid = re.sub(crops[0], crops[1], lid)
            else:
                # Standard crop out
                lid = lid.replace(self.cropout, "")

        # Every configured header is present in the metadata, as a string
        # (empty when the header is missing).
        msg_metadata = {}
        for k in self.keys:
            value = msg.get(k)
            msg_metadata[k] = str(value) if value else ""

        # Provisional id; replaced below if a generator-based id can be built.
        seed = "%s-%s" % (lid, msg_metadata['archived-at'])
        mid = hashlib.sha224(seed.encode('utf-8')).hexdigest() + "@" + (
            lid if lid else "none")

        # Decode RFC 2047 encoded words in the displayed headers.
        for key in ['to', 'from', 'subject', 'message-id']:
            try:
                hval = ""
                if msg_metadata.get(key):
                    for chunk, charset in email.header.decode_header(
                            msg_metadata[key]):
                        if charset is None or "8bit" in charset:
                            hval += chunk.decode('utf-8') if isinstance(
                                chunk, bytes) else chunk
                        else:
                            hval += chunk.decode(charset, errors='ignore')
                    msg_metadata[key] = hval
            except Exception as err:
                print("Could not decode headers, ignoring..: %s" % err)

        # Pick the first header that yields a usable timestamp: Date, then
        # Archived-At. A header only counts if it both parses and converts
        # to an epoch, so an unparseable Archived-At no longer crashes the
        # epoch calculation.
        mdate = None
        epoch = None
        for source in ('date', 'archived-at'):
            try:
                candidate = email.utils.parsedate_tz(msg_metadata.get(source))
                epoch = email.utils.mktime_tz(candidate)
            except Exception:
                continue
            mdate = candidate
            break
        if mdate is None:
            print("Date (%s) seems totally wrong, setting to _now_ instead." %
                  msg_metadata.get('date'))
            # Standard 9-tuple plus a fake TZ (10th element)
            mdate = time.gmtime() + (0, )
            epoch = email.utils.mktime_tz(mdate)

        # mdate calculations are all done, prepare the index entry
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))

        # The body is passed to the id generators below, so this conversion
        # chain is kept exactly as it was to avoid changing generated ids.
        body = self.msgbody(msg)
        try:
            if 'content-type' in msg_metadata and msg_metadata[
                    'content-type'].find("flowed") != -1:
                body = convertToWrapped(body, character_set="utf-8")
            if isinstance(body, str):
                body = body.encode('utf-8')
        except Exception:
            try:
                body = body.decode(chardet.detect(body)['encoding'])
            except Exception:
                try:
                    body = body.decode('latin-1')
                except Exception:
                    try:
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except Exception:
                        body = None

        attachments, contents = self.msgfiles(msg)
        irt = ""
        if body is not None or attachments:
            pmid = mid
            try:
                if archiver_generator == "full":
                    mid = generators.full(msg, body, lid, attachments)
                elif archiver_generator == "medium":
                    mid = generators.medium(msg, body, lid, attachments)
                elif archiver_generator == "cluster":
                    mid = generators.cluster(msg, body, lid, attachments)
                else:
                    mid = generators.legacy(msg, body, lid, attachments)
            except Exception as err:
                if logger:
                    logger.warning("Could not generate MID: %s", err)
                # Fall back to the provisional id
                mid = pmid
            if 'in-reply-to' in msg_metadata:
                try:
                    try:
                        irt = "".join(msg_metadata['in-reply-to'])
                    except Exception:
                        irt = msg_metadata.get('in-reply-to').__str__()
                except Exception:
                    irt = ""
            ojson = {
                'from_raw': msg_metadata['from'],
                'from': msg_metadata['from'],
                'to': msg_metadata['to'],
                'subject': msg_metadata['subject'],
                'message-id': msg_metadata['message-id'],
                'mid': mid,
                'cc': msg_metadata.get('cc'),
                'epoch': epoch,
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': msg_metadata['references'],
                'in-reply-to': irt,
                'body': body.decode('utf-8', 'replace')
                if type(body) is bytes else body,
                'attachments': attachments
            }

        self.msg_metadata = msg_metadata
        self.irt = irt

        return ojson, contents
示例#4
0
    def compute_updates(self, lid, private, msg):
        """Determine what needs to be sent to the archiver.

        :param lid: The list id. When empty, the normalized List-Id header of
                    the message is used.
        :param private: Whether privately archived email or not (bool)
        :param msg: The message object

        :return A four-tuple consisting of: the digested email as a dict (None
                if the message has neither a usable body nor attachments), the
                second value returned by ``self.msgfiles(msg)``, the metadata
                fields and any in-reply-to data found.
        """

        ojson = None
        if not lid:
            lid = normalize_lid(msg.get('list-id'))
        # Only crop when there is a list id to crop.
        if self.cropout and lid:
            crops = self.cropout.split(" ")
            if len(crops) == 2:
                # Regex replace
                lid = re.sub(crops[0], crops[1], lid)
            else:
                # Standard crop out
                lid = lid.replace(self.cropout, "")

        # Every configured header is present in the metadata, as a string
        # (empty when the header is missing).
        msg_metadata = {}
        for k in self.keys:
            value = msg.get(k)
            msg_metadata[k] = str(value) if value else ""

        # Provisional id; replaced below if the generator succeeds.
        seed = "%s-%s" % (lid, msg_metadata['archived-at'])
        mid = hashlib.sha224(seed.encode('utf-8')).hexdigest() + "@" + (
            lid if lid else "none")

        # Decode RFC 2047 encoded words in the displayed headers.
        for key in ['to', 'from', 'subject', 'message-id']:
            try:
                hval = ""
                if msg_metadata.get(key):
                    for chunk, charset in email.header.decode_header(
                            msg_metadata[key]):
                        if charset is None or "8bit" in charset:
                            hval += chunk.decode('utf-8') if isinstance(
                                chunk, bytes) else chunk
                        else:
                            hval += chunk.decode(charset, errors='ignore')
                    msg_metadata[key] = hval
            except Exception as err:
                print("Could not decode headers, ignoring..: %s" % err)

        # Pick the first header that yields a usable timestamp: Date, then
        # Archived-At. A header only counts if it both parses and converts
        # to an epoch, so an unparseable Archived-At no longer crashes the
        # epoch calculation.
        mdate = None
        epoch = None
        for source in ('date', 'archived-at'):
            try:
                candidate = email.utils.parsedate_tz(msg_metadata.get(source))
                epoch = email.utils.mktime_tz(candidate)
            except Exception:
                continue
            mdate = candidate
            break
        if mdate is None:
            print("Date (%s) seems totally wrong, setting to _now_ instead." %
                  msg_metadata.get('date'))
            # Standard 9-tuple plus a fake TZ (10th element)
            mdate = time.gmtime() + (0, )
            epoch = email.utils.mktime_tz(mdate)

        # mdate calculations are all done, prepare the index entry
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))

        is_flowed = 'content-type' in msg_metadata and msg_metadata[
            'content-type'].find("flowed") != -1

        body = self.msgbody(msg)
        saved_body = None  # for format=flowed
        try:
            if is_flowed:
                saved_body = body  # so we can redo it properly later
                # N.B. the convertToWrapped call usually fails, because body is generally a string here
                # However sometimes body is bytes at this point in which case it works
                body = formatflowed.convertToWrapped(body,
                                                     character_set="utf-8")
                # DO NOT FIX IT -- otherwise generated MIDs will change
                # The code now applies the formatting properly later
            if isinstance(body, str):
                body = body.encode('utf-8')
        except Exception:
            try:
                body = body.decode(chardet.detect(body)['encoding'])
            except Exception:
                try:
                    body = body.decode('latin-1')
                except Exception:
                    try:
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except Exception:
                        body = None

        attachments, contents = self.msgfiles(msg)
        irt = ""
        if body is not None or attachments:
            pmid = mid
            try:
                mid = generators.generate(self.generator, msg, body, lid,
                                          attachments)
            except Exception as err:
                if logger:
                    # N.B. use .get just in case there is no message-id
                    logger.info(
                        "Could not generate MID using %s: %s. MSGID: %s",
                        self.generator, err,
                        msg_metadata.get('message-id', '?').strip())
                # Fall back to the provisional id
                mid = pmid

            if 'in-reply-to' in msg_metadata:
                try:
                    try:
                        irt = "".join(msg_metadata['in-reply-to'])
                    except Exception:
                        irt = msg_metadata.get('in-reply-to').__str__()
                except Exception:
                    irt = ""

            # The MID is already fixed at this point, so the stored body can
            # now be unflowed properly without affecting generated ids.
            if not self.skipff and is_flowed:
                if isinstance(saved_body, str):
                    saved_body = saved_body.encode('utf-8', 'replace')
                try:
                    # Allow wrapping to be done on the client display by unwrapping
                    # to a single long line.
                    # The value 2000 should be more than enough for most email paragraphs.
                    # formatflowed requires CRLF line endings, but generates LF endings...
                    # TEMP: LF-to-CRLF conversion of saved_body is disabled until
                    # the tests can be fixed.
                    body = formatflowed.convertToWrapped(saved_body,
                                                         width=2000,
                                                         wrap_fixed=False,
                                                         character_set="utf-8")
                except Exception:
                    pass  # Don't try to recover; keep the body as it was

            ojson = {
                'from_raw': msg_metadata['from'],
                'from': msg_metadata['from'],
                'to': msg_metadata['to'],
                'subject': msg_metadata['subject'],
                'message-id': msg_metadata['message-id'],
                'mid': mid,
                'cc': msg_metadata.get('cc'),
                'epoch': epoch,
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': msg_metadata['references'],
                'in-reply-to': irt,
                'body': body.decode('utf-8', 'replace')
                if type(body) is bytes else body,
                'attachments': attachments
            }

        return ojson, contents, msg_metadata, irt
示例#5
0
    def run(self):
        """Worker loop: take mailbox sources off the shared queue and import them.

        Repeatedly pops an entry from the global ``lists`` (guarded by the
        global lock ``block``), obtains the mailbox either from a local file
        (optionally gzip-compressed) or by downloading it from ``source``,
        and then walks every message in it. Each message is either re-sent
        via the local MTA (when ``resendTo`` is set) or digested into an
        index document plus a source document, which are pushed in batches
        through ``BulkThread``. Nothing is written to the index when
        ``args.dry`` is set.

        Reads module-level configuration (``filebased``, ``maildir``,
        ``list_override``, ``cropout``, ``appender``, ``piperWeirdness``,
        ``private``, ``args``, ``timeout``) and updates the global counters
        ``y`` and ``baddies``.
        """
        global block, y, es, lists, baddies, config, resendTo, timeout
        ja = []
        jas = []
        print("Thread started")
        mla = None
        ml = ""
        mboxfile = ""
        filename = ""
        xlist_override = None

        while len(lists) > 0:
            print("%u elements left to slurp" % len(lists))
            # The lock is released on every exit path by the with-statement.
            with block:
                try:
                    mla = lists.pop(0)
                except Exception as err:
                    print("Could not pop list: %s" % err)
                    return
                if not mla:
                    print("Nothing more to do here")
                    return
            y += 1
            EY = 1980
            EM = 1
            stime = time.time()
            dFile = False
            if filebased:
                tmpname = mla[0]
                filename = mla[0]
                xlist_override = mla[1]
                if filename.find(".gz") != -1:
                    print("Decompressing %s..." % filename)
                    try:
                        with open(filename, "rb") as bf:
                            bmd = bf.read()
                        bmd = gzip.decompress(bmd)
                        tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False)
                        tmpfile.write(bmd)
                        tmpfile.flush()
                        tmpfile.close()
                        tmpname = tmpfile.name
                        filename = tmpname
                        dFile = True  # Slated for deletion upon having been read
                        print("%s -> %u bytes" % (tmpname, len(bmd)))
                    except Exception as err:
                        print("This wasn't a gzip file: %s" % err)
                print("Slurping %s" % filename)
            else:
                ml = mla[0]
                mboxfile = mla[1]
                xlist_override = list_override
                print("Slurping %s/%s" % (ml, mboxfile))
                # File names starting with YYYYMM give the expected year/month
                m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
                EY = 1997
                EM = 1
                if m:
                    EY = int(m.group(1))
                    EM = int(m.group(2))
                # Close the HTTP response as soon as the payload has been read
                with urlopen("%s%s/%s" % (source, ml, mboxfile)) as ctx:
                    inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')

                tmpname = hashlib.sha224(("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8')).hexdigest()
                with open(tmpname, "w") as f:
                    f.write(inp)

            count = 0
            LEY = EY

            if maildir:
                messages = mailbox.Maildir(tmpname)
            else:
                messages = mailbox.mbox(tmpname)

            for message in messages:
                if resendTo:
                    # Parenthesised: '%' binds tighter than the conditional
                    print("Delivering message %s via MTA" % (message['message-id'] if 'message-id' in message else '??'))
                    # Each header is handled on its own, so a missing List-ID
                    # no longer prevents To from being rewritten (and a
                    # missing To no longer duplicates List-ID).
                    if list_override:
                        try:
                            message.replace_header('List-ID', list_override)
                        except KeyError:
                            message['List-ID'] = list_override
                    try:
                        message.replace_header('To', resendTo)
                    except KeyError:
                        message['To'] = resendTo
                    message['cc'] = None
                    # The context manager closes the connection after each delivery
                    with SMTP('localhost') as s:
                        s.send_message(message, from_addr=None, to_addrs=(resendTo))
                    continue
                if (time.time() - stime > timeout):  # break out after N seconds, it shouldn't take this long..!
                    print("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                    break
                if 'subject' in message:
                    mid = message['message-id']

                    lid = message['list-id']
                    if args.requirelid and (not lid or lid == ""):
                        continue
                    if not lid or lid == "":  # Guess list name in absence
                        lid = '.'.join(reversed(ml.split("-"))) + "." + appender

                    # Compact LID to <foo@domain>, discard rest
                    m = re.search(r"(<.+>)", lid)
                    if m:
                        lid = m.group(1)
                    if xlist_override and len(xlist_override) > 3:
                        lid = xlist_override
                    lid = lid.replace("@", ".")  # we want foo.bar.org, not foo@bar.org
                    lid = "<%s>" % lid.strip("<>")  # We need <> around it!
                    if cropout:
                        crops = cropout.split(" ")
                        # Regex replace?
                        if len(crops) == 2:
                            lid = re.sub(crops[0], crops[1], lid)
                        # Standard crop out?
                        else:
                            lid = lid.replace(cropout, "")

                    # The body bytes feed the id hash below, so this
                    # conversion chain is intentionally left as it was.
                    body = msgbody(message)
                    try:
                        if 'content-type' in message and message['content-type'].find("flowed") != -1:
                            body = convertToWrapped(body, character_set="utf-8")
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except Exception:
                        try:
                            body = body.decode(chardet.detect(body)['encoding'])
                        except Exception:
                            try:
                                body = body.decode('latin-1')
                            except Exception:
                                try:
                                    if isinstance(body, str):
                                        body = body.encode('utf-8')
                                except Exception:
                                    body = None

                    okay = True
                    dheader = {}
                    # Decode RFC 2047 encoded words in the displayed headers
                    for key in ['to', 'from', 'subject', 'message-id']:
                        try:
                            hval = ""
                            if message.get(key):
                                for t in email.header.decode_header(message[key]):
                                    if t[1] is None or t[1].find("8bit") != -1:
                                        hval += t[0].decode('utf-8', errors='replace') if type(t[0]) is bytes else t[0]
                                    else:
                                        hval += t[0].decode(t[1], errors='ignore')
                                dheader[key] = hval
                            else:
                                dheader[key] = "(Unknown)"
                        except Exception as err:
                            print("Could not decode headers, ignoring..: %s" % err)
                            okay = False
                    mdt = ""
                    if 'date' not in message and 'received' in message:
                        print("No Date header found, resorting to Received")
                        m = re.search(r"(\d+ \S+ \d{4} \d\d:\d\d:\d\d ([-+]\d{4})?)", message['received'])
                        if m:
                            mdt = m.group(1)
                    else:
                        mdt = message['date']
                    mdate = None
                    uid_mdate = 0
                    try:
                        mdate = email.utils.parsedate_tz(mdt)
                        uid_mdate = email.utils.mktime_tz(mdate)
                    except Exception:
                        pass
                    if not mdate or mdate[0] < (LEY - 1):
                        print("Date is wrong or missing here, setting to %s" % (LEY))
                        # mktime_tz needs a 10th (timezone) element; a bare
                        # struct_time made it fail and the message was then
                        # dropped. None means "interpret as local time".
                        mdate = datetime.datetime(LEY, EM, 1).timetuple() + (None, )
                    else:
                        LEY = mdate[0]  # Gather evidence 'n'stuff!
                    mdatestring = ""
                    try:
                        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(email.utils.mktime_tz(mdate)))
                    except Exception:
                        okay = False
                    if body and okay and mdate and {'from', 'subject'} <= set(dheader):
                        # Pipermail transforms from: to something weird - reset that!
                        if piperWeirdness:
                            m = re.match(r"(.+) at ([^(]+) \((.+)\)$", dheader['from'])
                            if m:
                                dheader['from'] = "%s <%s@%s>" % (m.group(3), m.group(1), m.group(2))
                            else:
                                # Try just 'foo at bar.tld' if 'foo at bar.tld (foo bar)' isn't working.
                                # This form has no display name, hence no third group.
                                m = re.match(r"(.+) at ([^(]+)$", dheader['from'])
                                if m:
                                    dheader['from'] = "%s@%s" % (m.group(1), m.group(2))

                        attachments, contents = msgfiles(message)
                        if not mid:
                            try:
                                mid = hashlib.sha256(body if type(body) is bytes else body.encode('ascii', errors='ignore')).hexdigest() + "@" + lid + "@" + appender
                            except Exception:
                                # hashlib needs bytes, so the seed is encoded first
                                if filebased:
                                    seed = "%f-%f-%s" % (random.random(), time.time(), filename)
                                else:
                                    seed = "%f-%f-%s-%s" % (random.random(), time.time(), ml, mboxfile)
                                mid = hashlib.sha256(seed.encode('utf-8')).hexdigest() + "@" + appender
                            print("No MID found, setting to %s" % mid)
                        mid2 = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
                        count += 1
                        mr = ""
                        if 'references' in message:
                            mr = message['references']
                        irt = ""
                        if 'in-reply-to' in message:
                            try:
                                irt = "".join(message['in-reply-to'])
                            except Exception:
                                irt = message.get('in-reply-to').__str__()

                        jdoc = {
                            'from_raw': dheader['from'],
                            'from': dheader['from'],
                            'to': dheader['to'],
                            'subject': dheader['subject'],
                            'cc': message.get('cc'),
                            'message-id': mid,
                            'mid': mid2,
                            'epoch': email.utils.mktime_tz(mdate),
                            'list': lid,
                            'list_raw': lid,
                            'date': mdatestring,
                            'private': private,
                            'references': mr,
                            'in-reply-to': irt,
                            'body': body.decode('utf-8', errors='replace') if type(body) is bytes else body,
                            'attachments': attachments
                        }
                        jdoc_source = {
                            'mid': mid2,
                            'message-id': mid,
                            'source': message.as_bytes().decode('utf-8', errors='replace')
                        }
                        ja.append(jdoc)
                        jas.append(jdoc_source)
                        if contents:
                            iname = config.get("elasticsearch", "dbname")
                            if not args.dry:
                                for key in contents:
                                    es.index(
                                        index=iname,
                                        doc_type="attachment",
                                        id=key,
                                        body={
                                            'source': contents[key]
                                        }
                                    )
                        # Flush both batches once 40 documents have piled up
                        if len(ja) >= 40:
                            if not args.dry:
                                bulk = BulkThread()
                                bulk.assign(ja, es, 'mbox')
                                bulk.insert()
                            ja = []

                            if not args.dry:
                                bulks = BulkThread()
                                bulks.assign(jas, es, 'mbox_source')
                                bulks.insert()
                            jas = []
                else:
                    baddies += 1

            # Release the mailbox's file handle before the file is removed
            messages.close()
            if filebased:
                print("Parsed %u records from %s" % (count, filename))
                if dFile:
                    os.unlink(tmpname)
            else:
                print("Parsed %s/%s: %u records from %s" % (ml, mboxfile, count, tmpname))
                os.unlink(tmpname)

            y += count
            # Flush whatever is left over from this mailbox
            if not args.dry:
                bulk = BulkThread()
                bulk.assign(ja, es)
                bulk.insert()
            ja = []

            if not args.dry:
                bulks = BulkThread()
                bulks.assign(jas, es, 'mbox_source')
                bulks.insert()
            jas = []
        print("Done, %u elements left to slurp" % len(lists))
示例#6
0
    def run(self):
        """Worker loop: import mbox archives until the shared work list is empty.

        Repeatedly pops one entry off the module-level ``lists`` queue (guarded
        by the ``block`` lock). In file-based mode an entry is
        ``(mbox path, list-id override)``; otherwise it is
        ``(list name, mbox file name)`` and the archive is first downloaded
        from ``source`` into a temporary file, which is removed afterwards.

        For every message in the mbox the method either re-delivers it through
        the local MTA (when ``resendTo`` is set) or builds two documents -- the
        parsed message and its raw source -- and hands them to ``BulkThread``
        in batches of 40 (skipped when ``args.dry`` is set). Messages without
        a Subject header are counted in ``baddies``.

        Side effects: updates the globals ``y`` and ``baddies``, consumes
        ``lists``, writes/removes temporary files, and prints progress to
        stdout. Returns ``None``.
        """
        global block, y, es, lists, baddies, config, resendTo
        ja = []   # pending parsed-message documents
        jas = []  # pending raw-source documents
        print("Thread started")
        mla = None
        ml = ""
        mboxfile = ""
        filename = ""
        xlist_override = None

        while len(lists) > 0:
            print("%u elements left to slurp" % len(lists))
            # Pop the next work item under the lock; the finally clause
            # guarantees the lock is released on every exit path.
            block.acquire()
            try:
                mla = lists.pop(0)
            except Exception as err:
                print("Could not pop list: %s" % err)
                return
            finally:
                block.release()
            if not mla:
                print("Nothing more to do here")
                return
            y += 1
            # Fallback year/month used when a message carries no usable date.
            EY = 1980
            EM = 1
            stime = time.time()
            if filebased:
                tmpname = mla[0]
                filename = mla[0]
                xlist_override = mla[1]
                print("Slurping %s" % filename)
            else:
                ml = mla[0]
                mboxfile = mla[1]
                xlist_override = list_override
                print("Slurping %s/%s" % (ml, mboxfile))
                # Archive names that start with YYYYMM give a better fallback date.
                m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
                EY = 1997
                EM = 1
                if m:
                    EY = int(m.group(1))
                    EM = int(m.group(2))
                ctx = urlopen("%s%s/%s" % (source, ml, mboxfile))
                inp = ctx.read().decode(ctx.headers.get_content_charset()
                                        or 'utf-8',
                                        errors='ignore')

                # Random, collision-resistant name for the downloaded copy.
                tmpname = hashlib.sha224(
                    ("%f-%f-%s-%s.mbox" %
                     (random.random(), time.time(), ml,
                      mboxfile)).encode('utf-8')).hexdigest()
                with open(tmpname, "w") as f:
                    f.write(inp)

            count = 0
            LEY = EY  # last plausible year seen so far in this mbox
            for message in mailbox.mbox(tmpname):
                if resendTo:
                    # Parenthesised so the '??' placeholder is substituted
                    # into the message instead of replacing the whole line.
                    print("Delivering message %s via MTA" %
                          (message['message-id']
                           if 'message-id' in message else '??'))
                    # Context manager closes the SMTP connection per message.
                    with SMTP('localhost') as s:
                        try:
                            if list_override:
                                message.replace_header('List-ID',
                                                       list_override)
                            message.replace_header('To', resendTo)
                        except Exception:
                            # replace_header raises if the header is absent.
                            if list_override:
                                message['List-ID'] = list_override
                        message['cc'] = None
                        s.send_message(message,
                                       from_addr=None,
                                       to_addrs=(resendTo))
                    continue
                if time.time() - stime > 360:
                    # break out after 6 minutes, it shouldn't take this long..!
                    print(
                        "Whoa, this is taking way too long, ignoring %s for now"
                        % tmpname)
                    break
                if 'subject' in message:
                    mid = message['message-id']

                    lid = message['list-id']
                    if not lid:  # Guess list name in absence
                        lid = '.'.join(reversed(
                            ml.split("-"))) + "." + appender

                    # Compact LID to <foo@domain>, discard rest
                    m = re.search(r"(<.+>)", lid)
                    if m:
                        lid = m.group(1)
                    if xlist_override and len(xlist_override) > 3:
                        lid = xlist_override
                    # we want foo.bar.org, not foo@bar.org
                    lid = lid.replace("@", ".")
                    lid = "<%s>" % lid.strip("<>")  # We need <> around it!
                    if cropout:
                        crops = cropout.split(" ")
                        # Regex replace?
                        if len(crops) == 2:
                            lid = re.sub(crops[0], crops[1], lid)
                        # Standard crop out?
                        else:
                            lid = lid.replace(cropout, "")

                    body = msgbody(message)
                    try:
                        if 'content-type' in message and message[
                                'content-type'].find("flowed") != -1:
                            body = convertToWrapped(body,
                                                    character_set="utf-8")
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except Exception:
                        # Progressive fallbacks: detected charset, latin-1,
                        # plain UTF-8 encode, and finally give up (None).
                        try:
                            body = body.decode(
                                chardet.detect(body)['encoding'])
                        except Exception:
                            try:
                                body = body.decode('latin-1')
                            except Exception:
                                try:
                                    if isinstance(body, str):
                                        body = body.encode('utf-8')
                                except Exception:
                                    body = None

                    okay = True
                    dheader = {}
                    for key in ['to', 'from', 'subject', 'message-id']:
                        try:
                            hval = ""
                            if message.get(key):
                                for t in email.header.decode_header(
                                        message[key]):
                                    if t[1] is None or t[1].find("8bit") != -1:
                                        hval += t[0].decode(
                                            'utf-8', errors='replace'
                                        ) if isinstance(t[0], bytes) else t[0]
                                    else:
                                        hval += t[0].decode(t[1],
                                                            errors='ignore')
                                dheader[key] = hval
                            else:
                                dheader[key] = "(Unknown)"
                        except Exception as err:
                            print("Could not decode headers, ignoring..: %s" %
                                  err)
                            okay = False
                    mdt = ""
                    if 'date' not in message and 'received' in message:
                        print("No Date header found, resorting to Received")
                        m = re.search(
                            r"(\d+ \S+ \d{4} \d\d:\d\d:\d\d ([-+]\d{4})?)",
                            message['received'])
                        if m:
                            mdt = m.group(1)
                    else:
                        mdt = message['date']
                    mdate = None
                    uid_mdate = 0
                    try:
                        mdate = email.utils.parsedate_tz(mdt)
                        uid_mdate = email.utils.mktime_tz(mdate)
                    except Exception:
                        # Unparseable date: handled by the fallback below.
                        pass
                    if not mdate or mdate[0] < (LEY - 1):
                        print("Date is wrong or missing here, setting to %s" %
                              (LEY))
                        # mktime_tz() needs a 10-item tuple (index 9 is the
                        # UTC offset); timetuple() only yields 9 items, so
                        # append None, which means "interpret as local time".
                        mdate = tuple(
                            datetime.datetime(LEY, EM,
                                              1).timetuple()) + (None, )
                    else:
                        LEY = mdate[0]  # Gather evidence 'n'stuff!
                    mdatestring = ""
                    try:
                        mdatestring = time.strftime(
                            "%Y/%m/%d %H:%M:%S",
                            time.localtime(email.utils.mktime_tz(mdate)))
                    except Exception:
                        okay = False
                    if body and okay and mdate and {'from', 'subject'
                                                    } <= set(dheader):
                        # Pipermail transforms from: to something weird - reset that!
                        if piperWeirdness:
                            m = re.match(r"(.+) at ([^(]+) \((.+)\)$",
                                         dheader['from'])
                            if m:
                                dheader['from'] = "%s <%s@%s>" % (
                                    m.group(3), m.group(1), m.group(2))
                            else:
                                # Try just 'foo at bar.tld' if
                                # 'foo at bar.tld (foo bar)' isn't working.
                                # That form has no display name, so only the
                                # address itself is rebuilt.
                                m = re.match(r"(.+) at ([^(]+)$",
                                             dheader['from'])
                                if m:
                                    dheader['from'] = "<%s@%s>" % (
                                        m.group(1), m.group(2))

                        attachments, contents = msgfiles(message)
                        if not mid:
                            # No Message-ID: derive one from the body, or from
                            # random data if that fails. hashlib requires
                            # bytes, hence the explicit encode() calls.
                            try:
                                mid = hashlib.sha256(
                                    body if isinstance(body, bytes) else
                                    body.encode('ascii', errors='ignore')
                                ).hexdigest() + "@" + lid + "@" + appender
                            except Exception:
                                if filebased:
                                    mid = hashlib.sha256(
                                        ("%f-%f-%s" %
                                         (random.random(), time.time(),
                                          filename)).encode('utf-8')
                                    ).hexdigest() + "@" + appender
                                else:
                                    mid = hashlib.sha256(
                                        ("%f-%f-%s-%s" %
                                         (random.random(), time.time(), ml,
                                          mboxfile)).encode('utf-8')
                                    ).hexdigest() + "@" + appender
                            print("No MID found, setting to %s" % mid)
                        # Document id: body digest + message epoch + list id.
                        mid2 = "%s@%s@%s" % (hashlib.sha224(
                            body if isinstance(body, bytes) else body.
                            encode('ascii', 'ignore')).hexdigest(), uid_mdate,
                                             lid)
                        count += 1
                        mr = ""
                        if 'references' in message:
                            mr = message['references']
                        irt = ""
                        if 'in-reply-to' in message:
                            try:
                                irt = "".join(message['in-reply-to'])
                            except Exception:
                                irt = message.get('in-reply-to').__str__()

                        doc = {
                            'from_raw': dheader['from'],
                            'from': dheader['from'],
                            'to': dheader['to'],
                            'subject': dheader['subject'],
                            'cc': message.get('cc'),
                            'message-id': mid,
                            'mid': mid2,
                            'epoch': email.utils.mktime_tz(mdate),
                            'list': lid,
                            'list_raw': lid,
                            'date': mdatestring,
                            'private': private,
                            'references': mr,
                            'in-reply-to': irt,
                            'body': body.decode('utf-8', errors='replace')
                            if isinstance(body, bytes) else body,
                            'attachments': attachments,
                        }
                        doc_source = {
                            'mid': mid2,
                            'message-id': mid,
                            'source': message.as_bytes().decode(
                                'utf-8', errors='replace'),
                        }
                        ja.append(doc)
                        jas.append(doc_source)
                        if contents:
                            iname = config.get("elasticsearch", "dbname")
                            if not args.dry:
                                # Attachment payloads are indexed one by one.
                                for key in contents:
                                    es.index(index=iname,
                                             doc_type="attachment",
                                             id=key,
                                             body={'source': contents[key]})
                        # Flush both queues every 40 messages.
                        if len(ja) >= 40:
                            if not args.dry:
                                bulk = BulkThread()
                                bulk.assign(ja, es, 'mbox')
                                bulk.insert()
                            ja = []

                            if not args.dry:
                                bulks = BulkThread()
                                bulks.assign(jas, es, 'mbox_source')
                                bulks.insert()
                            jas = []
                else:
                    # No Subject header at all: count as a bad message.
                    baddies += 1
            if filebased:
                print("Parsed %u records from %s" % (count, filename))
            else:
                print("Parsed %s/%s: %u records from %s" %
                      (ml, mboxfile, count, tmpname))
                # Only the downloaded temporary copy is removed.
                os.unlink(tmpname)

            y += count
            # Flush whatever is left over from this mbox.
            if not args.dry:
                bulk = BulkThread()
                bulk.assign(ja, es)
                bulk.insert()
            ja = []

            if not args.dry:
                bulks = BulkThread()
                bulks.assign(jas, es, 'mbox_source')
                bulks.insert()
            jas = []
        print("Done, %u elements left to slurp" % len(lists))