示例#1
0
 def add_batch(self, documents):
     """
     Index a batch of documents with one writer and a single commit.

     See http://pythonhosted.org/Whoosh/batch.html

     ``documents`` is iterated once and must support either ``len()`` or a
     ``count()`` method (the latter is used as a fallback when ``len()``
     raises ``TypeError``, e.g. for a ResultSet); the size is only used for
     the progress log messages.

     Items for which ``IMessage.providedBy`` is true are converted with
     ``email_to_search_doc``; every other item is passed to
     ``writer.add_document`` as keyword arguments unchanged.

     If anything fails between opening the writer and finishing the loop,
     the writer is cancelled and the exception is re-raised. Otherwise the
     writer is committed.
     """
     logger.info("Indexing all messages")
     # Work out the batch size before opening the writer, so that a failure
     # here cannot leave an open writer behind.
     try:
         total = len(documents)
     except TypeError: # it's a ResultSet
         total = documents.count()
     # Do not pass limitmb=256 / procs=4 to the writer: it will eat up lots
     # of memory and can go as far as preventing forking (OSError), tested
     # on a 3GB VM with the Fedora archives.
     writer = self.index.writer(multisegment=True)
     try:
         # remove the LRU cache limit from the stemanalyzer; components
         # without these attributes are skipped
         for component in writer.schema["content"].analyzer:
             try:
                 component.cachesize = -1
                 component.clear()
             except AttributeError:
                 continue
         for num, doc in enumerate(documents):
             if IMessage.providedBy(doc):
                 doc = email_to_search_doc(doc)
             writer.add_document(**doc)
             if num % 1000 == 0:
                 # Lazy %-args: the message is only formatted when the
                 # record is actually emitted.
                 logger.info("...still indexing (%d/%d)...", num, total)
     except Exception:
         # Never leave the writer open after a failure.
         writer.cancel()
         raise
     else:
         writer.commit()
示例#2
0
 def add(self, doc):
     """
     Add a single document to the index and commit it immediately.

     ``doc`` is converted with ``email_to_search_doc`` when
     ``IMessage.providedBy(doc)`` is true; otherwise it is passed to
     ``writer.add_document`` as keyword arguments unchanged.

     Any exception raised while converting or adding the document cancels
     the writer and is then re-raised. On success the writer is committed.
     """
     writer = self.index.writer()
     try:
         # The conversion happens inside the try block so that a failure in
         # email_to_search_doc() also cancels the writer instead of leaving
         # it open (an open Whoosh writer is expected to keep the index
         # locked for writing -- see the Whoosh indexing documentation).
         if IMessage.providedBy(doc):
             doc = email_to_search_doc(doc)
         writer.add_document(**doc)
     except Exception:
         writer.cancel()
         raise
     else:
         writer.commit()
示例#3
0
def email_to_search_doc(email):
    """
    Build the dict of search-index fields for one email.

    Raises ``ValueError`` when ``IMessage.providedBy(email)`` is false.

    The returned dict always has the keys ``list_name``, ``message_id``,
    ``sender`` (sender name and address joined by a space), ``subject``,
    ``content``, ``date``, ``private_list`` (true when the list's archive
    policy is ``ArchivePolicy.private``) and ``user_id`` (``None``, or the
    sender's ``user_id.int`` converted to a unicode string). The key
    ``attachments`` (space-separated attachment names) is only set when the
    email has at least one attachment.
    """
    if not IMessage.providedBy(email):
        raise ValueError("not an instance of the Email class")

    is_private = email.mlist.archive_policy == ArchivePolicy.private
    fields = dict(
        list_name=email.list_name,
        message_id=email.message_id,
        sender=u"%s %s" % (email.sender_name, email.sender_email),
        subject=email.subject,
        content=email.content,
        date=email.date,  # UTC
        private_list=is_private,
    )

    # user_id stays None when the sender has none; otherwise it is stored as
    # the text form of its integer value (presumably a UUID -- confirm).
    sender_uid = email.sender.user_id
    fields["user_id"] = (
        None if sender_uid is None else unicode(sender_uid.int)
    )

    attachment_names = [att.name for att in email.attachments]
    if attachment_names:
        fields["attachments"] = " ".join(attachment_names)
    return fields