def add_batch(self, documents):
    """
    See http://pythonhosted.org/Whoosh/batch.html
    """
    logger.info("Indexing all messages")
    # Don't use the optimizations below: they eat up lots of memory and can
    # go as far as preventing forking (OSError); tested on a 3GB VM with
    # the Fedora archives.
    #writer = self.index.writer(limitmb=256, procs=4, multisegment=True)
    writer = self.index.writer(multisegment=True)
    # Remove the LRU cache limit from the stemming analyzer.
    for component in writer.schema["content"].analyzer:
        try:
            component.cachesize = -1
            component.clear()
        except AttributeError:
            continue
    try:
        total = len(documents)
    except TypeError:
        # It's a ResultSet, not a list.
        total = documents.count()
    try:
        for num, doc in enumerate(documents):
            if IMessage.providedBy(doc):
                doc = email_to_search_doc(doc)
            writer.add_document(**doc)
            if num % 1000 == 0:
                logger.info("...still indexing (%d/%d)..." % (num, total))
    except Exception:
        writer.cancel()
        raise
    else:
        writer.commit()
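
# A hedged helper sketch (not from the original module): one way the
# `self.index` object used above might be opened or created on disk.
# `path` and `schema` are assumptions; the Whoosh index API calls are real.
import os
from whoosh import index as whoosh_index

def get_index(path, schema):
    """Open the Whoosh index at `path`, creating it if it does not exist."""
    if whoosh_index.exists_in(path):
        return whoosh_index.open_dir(path)
    if not os.path.isdir(path):
        os.makedirs(path)
    return whoosh_index.create_in(path, schema)
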
def add(self, doc):
    """Index a single document, canceling the write on failure."""
    writer = self.index.writer()
    if IMessage.providedBy(doc):
        doc = email_to_search_doc(doc)
    try:
        writer.add_document(**doc)
    except Exception:
        writer.cancel()
        raise
    else:
        writer.commit()
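
# A hedged sketch of the query side (the original class's search method is
# not shown in this excerpt): parse a user query against the "subject" and
# "content" fields and return the stored fields of each hit. The function
# name and the `limit` parameter are assumptions.
from whoosh.qparser import MultifieldParser

def search_index(index, querystring, limit=10):
    parser = MultifieldParser(["subject", "content"], schema=index.schema)
    query = parser.parse(querystring)
    with index.searcher() as searcher:
        # Materialize the results while the searcher is still open.
        return [hit.fields() for hit in searcher.search(query, limit=limit)]
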
def email_to_search_doc(email):
    if not IMessage.providedBy(email):
        raise ValueError("not an instance of the Email class")
    private_list = (email.mlist.archive_policy == ArchivePolicy.private)
    search_doc = {
        "list_name": email.list_name,
        "message_id": email.message_id,
        "sender": u"%s %s" % (email.sender_name, email.sender_email),
        "subject": email.subject,
        "content": email.content,
        "date": email.date,  # UTC
        "private_list": private_list,
    }
    user_id = email.sender.user_id
    if user_id is not None:
        user_id = unicode(user_id.int)
        search_doc["user_id"] = user_id
    attachments = [a.name for a in email.attachments]
    if attachments:
        search_doc["attachments"] = " ".join(attachments)
    return search_doc
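
# A possible Whoosh schema matching the fields produced by
# email_to_search_doc above; this is an inference, not the project's actual
# schema. A stemming analyzer on "content" is assumed because add_batch
# adjusts its LRU cache, and the stored/unique flags are guesses.
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME, ID, TEXT, Schema

SEARCH_SCHEMA = Schema(
    list_name=ID(stored=True),
    message_id=ID(stored=True, unique=True),
    sender=TEXT(),
    subject=TEXT(stored=True),
    content=TEXT(analyzer=StemmingAnalyzer()),
    date=DATETIME(),
    private_list=BOOLEAN(),
    user_id=ID(),
    attachments=TEXT(),
)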