def handle(self, *args, **options):
    """Entry point of the management command: import mbox files into the DB.

    Positional ``args`` are mbox file paths; ``options`` carry the list
    address, verbosity, ``since`` cutoff and sync flags.  After the raw
    import, thread order/depth is recomputed for every impacted thread and
    (unless disabled) list properties are synchronized with Mailman.
    """
    self._check_options(args, options)
    setup_logging(self, options["verbosity"])
    # main
    list_address = options["list_address"].lower()
    ## Keep autocommit on SQLite:
    ## https://docs.djangoproject.com/en/1.6/topics/db/transactions/#savepoints-in-sqlite
    #if settings.DATABASES["default"]["ENGINE"] != "django.db.backends.sqlite3":
    #    transaction.set_autocommit(False)
    # Batch mode suppresses per-message Mailman lookups during the import.
    settings.HYPERKITTY_BATCH_MODE = True
    # Only import emails older than the latest email in the DB
    # (the newest stored date becomes the default "since" cutoff).
    latest_email_date = Email.objects.filter(
        mailinglist__name=list_address
        ).values("date").order_by("-date").first()
    if latest_email_date and not options["since"]:
        options["since"] = latest_email_date["date"]
    if options["since"] and options["verbosity"] >= 2:
        self.stdout.write("Only emails after %s will be imported"
                          % options["since"])
    importer = DbImporter(list_address, options, self.stdout, self.stderr)
    # disable mailman client for now
    for mbfile in args:
        if options["verbosity"] >= 1:
            self.stdout.write("Importing from mbox file %s to %s"
                              % (mbfile, list_address))
        # Skip files whose mtime predates the cutoff, unless told otherwise.
        if not options["ignore_mtime"] and options["since"] is not None:
            mtime = datetime.fromtimestamp(
                os.path.getmtime(mbfile), tz.tzlocal())
            if mtime <= options["since"]:
                if options["verbosity"] >= 2:
                    self.stdout.write('Mailbox file for %s is too old'
                                      % list_address)
                continue
        importer.from_mbox(mbfile)
        if options["verbosity"] >= 2:
            total_in_list = Email.objects.filter(
                mailinglist__name=list_address).count()
            self.stdout.write(' %s emails are stored into the database'
                              % total_in_list)
    #timeit("start")
    if options["verbosity"] >= 1:
        self.stdout.write("Computing thread structure")
    # Recompute order/depth only for threads the import actually touched.
    for thread in Thread.objects.filter(
            id__in=importer.impacted_thread_ids):
        #timeit("before")
        compute_thread_order_and_depth(thread)
        #timeit("after")
    #showtimes()
    if not options["no_sync_mailman"]:
        if options["verbosity"] >= 1:
            self.stdout.write("Synchronizing properties with Mailman")
        sync_with_mailman()
    #if not transaction.get_autocommit():
    #    transaction.commit()
    if options["verbosity"] >= 1:
        self.stdout.write(
            "The full-text search index will be updated every minute. Run "
            "the 'manage.py runjob update_index' command to update it now.")
def _compute_thread_positions(thread_id):
    """Recompute thread_order/thread_depth for the thread with this id.

    If the thread no longer exists (it may have been deleted in the
    meantime), a warning is logged and nothing else happens.
    """
    try:
        target = Thread.objects.get(id=thread_id)
    except Thread.DoesNotExist:
        # Deleted between scheduling and execution -- nothing we can do.
        log.warning(
            "Cannot rebuild the thread cache: thread %s does not exist.",
            thread_id)
    else:
        compute_thread_order_and_depth(target)
def Email_update_or_clean_thread(sender, **kwargs):
    """Signal handler: after an email changes, fix or drop its thread.

    An empty thread is deleted; a non-empty one gets its order/depth
    recomputed.  Silently ignores emails whose thread is already gone.
    """
    changed_email = kwargs["instance"]
    try:
        owning_thread = Thread.objects.get(id=changed_email.thread_id)
    except Thread.DoesNotExist:
        # The thread was already removed; nothing left to clean up.
        return
    if owning_thread.emails.count():
        compute_thread_order_and_depth(owning_thread)
    else:
        owning_thread.delete()
def on_email_deleted(self, email): from hyperkitty.tasks import rebuild_thread_cache_new_email # update or cleanup thread if self.emails.count() == 0: self.delete() else: if self.starting_email is None: self.find_starting_email() self.save(update_fields=["starting_email"]) compute_thread_order_and_depth(self) self.date_active = self.emails.order_by("-date").first().date rebuild_thread_cache_new_email(self.id)
def on_post_delete(self):
    """Post-delete hook for an email: refresh counters and fix its thread."""
    # Keep the cached counters accurate after the row disappeared.
    self._refresh_count_cache()
    # update_or_clean_thread
    try:
        owning_thread = Thread.objects.get(id=self.thread_id)
    except Thread.DoesNotExist:
        return
    if not owning_thread.emails.count():
        # This was the last email: drop the (now empty) thread.
        owning_thread.delete()
        return
    if owning_thread.starting_email is None:
        # This email was the thread starter; elect a replacement.
        owning_thread.find_starting_email()
        owning_thread.save(update_fields=["starting_email"])
    compute_thread_order_and_depth(owning_thread)
def test_reply_to_oneself(self):
    """A message that is its own parent must not recurse forever."""
    # Yes, self-replies have been spotted in the wild.
    loop_thread = Thread.objects.create(mailinglist=self.mlist,
                                        thread_id="msg1")
    message = self.make_fake_email(1)
    message.save()
    loop_thread.starting_email = message
    loop_thread.save()
    # Point the message at itself and seed bogus order/depth values.
    message.parent = message
    message.thread_order = 42
    message.thread_depth = 42
    message.save()
    compute_thread_order_and_depth(loop_thread)
    refreshed = Email.objects.get(id=message.id)
    # Don't traceback with a "maximum recursion depth exceeded" error
    self.assertEqual(refreshed.thread_order, 0)
    self.assertEqual(refreshed.thread_depth, 0)
def test_reply_loops(self):
    """A two-message parent/child cycle must not loop forever."""
    # Replying to a message not yet sent should be impossible, but you
    # never know: Dr Who may be on your mailing-list.
    loop_thread = Thread.objects.create(mailinglist=self.mlist,
                                        thread_id="msg1")
    first = self.make_fake_email(1, thread=loop_thread)
    first.save()
    loop_thread.starting_email = first
    loop_thread.save()
    second = self.make_fake_email(2, thread=loop_thread)
    second.parent = first
    second.save()
    # Close the loop: the first message now replies to the second.
    first.parent = second
    first.save()
    # Success is simply terminating without infinite recursion.
    compute_thread_order_and_depth(loop_thread)
def on_post_delete(self):
    """Post-delete hook for an email: refresh counters and fix its thread.

    Deletes the thread when this was its last email; otherwise makes sure
    the thread still has a starting email and recomputes the reply tree.
    """
    # refresh the count cache
    self._refresh_count_cache()
    # update_or_clean_thread
    try:
        thread = Thread.objects.get(id=self.thread_id)
    except Thread.DoesNotExist:
        # Thread already removed (e.g. cascading delete); nothing to do.
        return
    if thread.emails.count() == 0:
        thread.delete()
    else:
        if thread.starting_email is None:
            # This email was the thread starter; elect a replacement.
            thread.find_starting_email()
            thread.save(update_fields=["starting_email"])
        compute_thread_order_and_depth(thread)
def test_reply_loops(self): """Loops in message replies""" # This implies that someone replies to a message not yet sent, but you # never know, Dr Who can be on your mailing-list. thread = Thread.objects.create( mailinglist=self.mlist, thread_id="msg1") msg1 = self.make_fake_email(1, thread=thread) msg1.save() thread.starting_email = msg1 thread.save() msg2 = self.make_fake_email(2, thread=thread) msg2.parent = msg1 msg2.save() msg1.parent = msg2 msg1.save() compute_thread_order_and_depth(thread)
def test_reply_to_oneself(self):
    # A message replying to itself (yes, it's been spotted in the wild)
    thread = Thread.objects.create(
        mailinglist=self.mlist, thread_id="msg1")
    msg1 = self.make_fake_email(1)
    msg1.save()
    thread.starting_email = msg1
    thread.save()
    # Make the message its own parent and seed bogus order/depth values
    # so the recomputation has something to overwrite.
    msg1.parent = msg1
    msg1.thread_order = msg1.thread_depth = 42
    msg1.save()
    compute_thread_order_and_depth(thread)
    # Reload from the database to see the persisted values.
    msg1 = Email.objects.get(id=msg1.id)
    # Don't traceback with a "maximum recursion depth exceeded" error
    self.assertEqual(msg1.thread_order, 0)
    self.assertEqual(msg1.thread_depth, 0)
def test_classical_thread(self):
    """Order and depth on a small forked thread.

    msg1
    |-msg2
    | `-msg4
    `-msg3
    """
    forked_thread = Thread.objects.create(mailinglist=self.mlist,
                                          thread_id="msg1")
    # All four messages belong to the same thread.
    msg1 = self.make_fake_email(1, thread=forked_thread)
    msg2 = self.make_fake_email(2, thread=forked_thread)
    msg3 = self.make_fake_email(3, thread=forked_thread)
    msg4 = self.make_fake_email(4, thread=forked_thread)
    # Set up the reply tree.
    msg1.save()
    forked_thread.starting_email = msg1
    forked_thread.save()
    msg2.parent = msg3.parent = msg1
    msg2.save()
    msg3.save()
    msg4.parent = msg2
    msg4.save()
    # Seed every message with bogus order/depth values so the
    # recomputation has something to overwrite.
    for message in (msg1, msg2, msg3, msg4):
        message.thread_order = 42
        message.thread_depth = 42
        message.save()
    compute_thread_order_and_depth(forked_thread)
    # Reload from the database and check the persisted positions.
    msg1 = Email.objects.get(id=msg1.id)
    msg2 = Email.objects.get(id=msg2.id)
    msg3 = Email.objects.get(id=msg3.id)
    msg4 = Email.objects.get(id=msg4.id)
    self.assertEqual(msg1.thread_order, 0)
    self.assertEqual(msg1.thread_depth, 0)
    self.assertEqual(msg2.thread_order, 1)
    self.assertEqual(msg2.thread_depth, 1)
    self.assertEqual(msg3.thread_order, 3)
    self.assertEqual(msg3.thread_depth, 1)
    self.assertEqual(msg4.thread_order, 2)
    self.assertEqual(msg4.thread_depth, 2)
def test_classical_thread(self):
    """Order and depth on a classical forked thread (diagram below)."""
    # msg1
    # |-msg2
    # | `-msg4
    # `-msg3
    thread = Thread.objects.create(
        mailinglist=self.mlist, thread_id="msg1")
    # All in the same thread
    msg1 = self.make_fake_email(1, thread=thread)
    msg2 = self.make_fake_email(2, thread=thread)
    msg3 = self.make_fake_email(3, thread=thread)
    msg4 = self.make_fake_email(4, thread=thread)
    # Set up the reply tree
    msg1.save()
    thread.starting_email = msg1
    thread.save()
    msg2.parent = msg3.parent = msg1
    msg2.save()
    msg3.save()
    msg4.parent = msg2
    msg4.save()
    # Init with false values (so recomputation visibly overwrites them)
    msg1.thread_order = msg1.thread_depth = \
        msg2.thread_order = msg2.thread_depth = \
        msg3.thread_order = msg3.thread_depth = \
        msg4.thread_order = msg4.thread_depth = 42
    msg1.save()
    msg2.save()
    msg3.save()
    msg4.save()
    compute_thread_order_and_depth(thread)
    # Reload from the database to check the persisted values.
    msg1 = Email.objects.get(id=msg1.id)
    msg2 = Email.objects.get(id=msg2.id)
    msg3 = Email.objects.get(id=msg3.id)
    msg4 = Email.objects.get(id=msg4.id)
    self.assertEqual(msg1.thread_order, 0)
    self.assertEqual(msg1.thread_depth, 0)
    self.assertEqual(msg2.thread_order, 1)
    self.assertEqual(msg2.thread_depth, 1)
    self.assertEqual(msg3.thread_order, 3)
    self.assertEqual(msg3.thread_depth, 1)
    self.assertEqual(msg4.thread_order, 2)
    self.assertEqual(msg4.thread_depth, 2)
def test_simple_thread(self):
    """A two-message thread: the reply sits at order 1, depth 1."""
    simple_thread = Thread.objects.create(mailinglist=self.mlist,
                                          thread_id="msg1")
    starter = self.make_fake_email(1, thread=simple_thread)
    # Seed bogus order/depth values so the recomputation must fix them.
    starter.thread_order = 42
    starter.thread_depth = 42
    starter.save()
    simple_thread.starting_email = starter
    simple_thread.save()
    reply = self.make_fake_email(2, thread=simple_thread)
    reply.parent = starter
    reply.thread_order = 42
    reply.thread_depth = 42
    reply.save()
    compute_thread_order_and_depth(simple_thread)
    # Reload from the database to see the persisted values.
    starter = Email.objects.get(id=starter.id)
    reply = Email.objects.get(id=reply.id)
    self.assertEqual(starter.thread_order, 0)
    self.assertEqual(starter.thread_depth, 0)
    self.assertEqual(reply.thread_order, 1)
    self.assertEqual(reply.thread_depth, 1)
def test_simple_thread(self):
    # A basic thread: msg2 replies to msg1
    thread = Thread.objects.create(
        mailinglist=self.mlist, thread_id="msg1")
    msg1 = self.make_fake_email(1, thread=thread)
    # Seed bogus order/depth values so the recomputation must fix them.
    msg1.thread_order = msg1.thread_depth = 42
    msg1.save()
    thread.starting_email = msg1
    thread.save()
    msg2 = self.make_fake_email(2, thread=thread)
    msg2.parent = msg1
    msg2.thread_order = msg2.thread_depth = 42
    msg2.save()
    compute_thread_order_and_depth(thread)
    # Must reload from the database
    msg1 = Email.objects.get(id=msg1.id)
    msg2 = Email.objects.get(id=msg2.id)
    self.assertEqual(msg1.thread_order, 0)
    self.assertEqual(msg1.thread_depth, 0)
    self.assertEqual(msg2.thread_order, 1)
    self.assertEqual(msg2.thread_depth, 1)
def set_parent(self, parent):
    """Re-parent this email (and its whole subthread) under ``parent``.

    Handles the tricky cases: the new parent being a descendant of this
    email (the cycle is broken by re-parenting it first), and the new
    parent living in a different thread (the subthread is moved over, the
    target thread's ``date_active`` is refreshed, and the former thread is
    deleted if left empty).  Finally the reply tree is recomputed.

    Raises ValueError if ``parent`` is this very email.
    """
    if self.id == parent.id:
        raise ValueError("An email can't be its own parent")
    # Compute the subthread: this email plus all its descendants.
    subthread = [self]

    def _collect_children(current_email):
        # Depth-first walk appending every descendant to `subthread`.
        children = list(current_email.children.all())
        if not children:
            return
        subthread.extend(children)
        for child in children:
            _collect_children(child)
    _collect_children(self)
    # Now set my new parent value.
    old_parent_id = self.parent_id
    self.parent = parent
    self.save(update_fields=["parent_id"])
    # If my future parent is in my current subthread, I need to set its
    # parent to my current parent, otherwise we'd create a cycle.
    if parent in subthread:
        parent.parent_id = old_parent_id
        parent.save(update_fields=["parent_id"])
        # Do it after setting the new parent_id to avoid having two
        # parent_ids set to None at the same time (IntegrityError).
    if self.thread_id != parent.thread_id:
        # We changed threads: reattach the whole subthread to the new one.
        former_thread = self.thread
        for child in subthread:
            child.thread = parent.thread
            child.save(update_fields=["thread_id"])
            if child.date > parent.thread.date_active:
                # Keep the target thread's last-activity date current.
                parent.thread.date_active = child.date
                parent.thread.save()
        # If we were the starting email, our former thread may be empty.
        if former_thread.emails.count() == 0:
            former_thread.delete()
    compute_thread_order_and_depth(parent.thread)
def set_parent(self, parent):
    """Re-parent this email (and its whole subthread) under ``parent``.

    Handles the tricky cases: the new parent being a descendant of this
    email (the cycle is broken by re-parenting it first), and the new
    parent living in a different thread (the subthread is moved over, the
    target thread's ``date_active`` is refreshed, and the former thread is
    deleted if left empty).  Finally the reply tree is recomputed.

    Raises ValueError if ``parent`` is this very email.
    """
    if self.id == parent.id:
        raise ValueError("An email can't be its own parent")
    # Compute the subthread: this email plus all its descendants.
    subthread = [self]

    def _collect_children(current_email):
        # Depth-first walk appending every descendant to `subthread`.
        children = list(current_email.children.all())
        if not children:
            return
        subthread.extend(children)
        for child in children:
            _collect_children(child)
    _collect_children(self)
    # Now set my new parent value.
    old_parent_id = self.parent_id
    self.parent = parent
    self.save(update_fields=["parent_id"])
    # If my future parent is in my current subthread, I need to set its
    # parent to my current parent, otherwise we'd create a cycle.
    if parent in subthread:
        parent.parent_id = old_parent_id
        parent.save(update_fields=["parent_id"])
        # Do it after setting the new parent_id to avoid having two
        # parent_ids set to None at the same time (IntegrityError).
    if self.thread_id != parent.thread_id:
        # We changed threads: reattach the whole subthread to the new one.
        former_thread = self.thread
        for child in subthread:
            child.thread = parent.thread
            child.save(update_fields=["thread_id"])
            if child.date > parent.thread.date_active:
                # Keep the target thread's last-activity date current.
                parent.thread.date_active = child.date
                parent.thread.save()
        # If we were the starting email, our former thread may be empty.
        if former_thread.emails.count() == 0:
            former_thread.delete()
    compute_thread_order_and_depth(parent.thread)
def execute(self):
    """Recompute order and depth for every thread in the database."""
    every_thread = Thread.objects.all()
    for current_thread in every_thread:
        compute_thread_order_and_depth(current_thread)
def add_to_list(list_name, message):
    """Archive an email ``message`` into the mailing-list ``list_name``.

    Creates the list record if needed, refuses duplicates and messages
    without a Message-Id, extracts sender/date/subject/content, attaches
    the email to an existing thread (via In-Reply-To) or creates a new
    one, fires the ``new_thread``/``new_email`` signals, and stores any
    attachments.  Returns the email's ``message_id_hash``, or None when
    the list's archive policy forbids archiving.

    Raises ValueError (missing Message-Id or non-ascii sender) and
    DuplicateMessage.
    """
    # timeit("1 start")
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        # Outside batch imports, refresh list properties from Mailman.
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    # timeit("2 after ml, before checking email & sender")
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # The mbox "From " separator line may carry the archived date.
    if message.get_unixfrom() is not None:
        mo = UNIXFROM_DATE_RE.match(message.get_unixfrom())
        if mo:
            archived_date = parsedate(mo.group(1))
            if archived_date is not None:
                email.archived_date = archived_date
    # Sender
    try:
        from_name, from_email = parseaddr(message['From'])
        from_name = header_to_unicode(from_name).strip()
        sender_address = from_email.decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # No usable address: synthesize one from the display name, or
        # fall back to a fixed placeholder.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    email.sender_name = from_name
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        # Best-effort lookup of the Mailman user id; ignore outages.
        try:
            sender.set_mailman_id()
        except MailmanConnectionError:
            pass
    # timeit("3 after sender, before email content")
    # Headers
    email.subject = header_to_unicode(message.get('Subject'))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)
    # Content
    scrubber = Scrubber(message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # timeit("4 after email content, before signals")
    # TODO: detect category?
    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create
            # a new thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            thread = ref_msg.thread
    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread, thread_created = Thread.objects.get_or_create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash)
        email.thread = thread
    email.save()  # must save before setting the thread.starting_email
    thread.date_active = email.date
    if thread_created:
        thread.starting_email = email
    thread.save()
    if thread_created:
        new_thread.send("Mailman", thread=thread)
        # signal_results = new_thread.send_robust("Mailman", thread=thread)
        # for receiver, result in signal_results:
        #     if isinstance(result, Exception):
        #         logger.warning(
        #             "Signal 'new_thread' to {} raised an "
        #             "exception: {}".format(
        #                 receiver.func_name, result))
    # Signals
    new_email.send("Mailman", email=email)
    # signal_results = new_email.send_robust("Mailman", email=email)
    # for receiver, result in signal_results:
    #     if isinstance(result, Exception):
    #         logger.warning(
    #             "Signal 'new_email' to {} raised an exception: {}".format(
    #                 receiver.func_name, result))
    #         #logger.exception(result)
    #         #from traceback import print_exc; print_exc(result)
    # timeit("5 after signals, before save")
    # timeit("6 after save")
    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        compute_thread_order_and_depth(email.thread)
    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(email=email, counter=counter, name=name,
                                  content_type=content_type,
                                  encoding=encoding, content=content)
    return email.message_id_hash
def execute(self):
    """Recompute thread_order/thread_depth for every thread in the DB."""
    for thread in Thread.objects.all():
        compute_thread_order_and_depth(thread)
def add_to_list(list_name, message):
    """Archive an email ``message`` into the mailing-list ``list_name``.

    Creates the list record if needed, refuses duplicates and messages
    without a Message-Id, extracts sender/date/subject/content, attaches
    the email to an existing thread (via In-Reply-To) or creates a new
    one, fires the ``new_thread``/``new_email`` signals, and stores any
    attachments.  Returns the email's ``message_id_hash``, or None when
    the list's archive policy forbids archiving.

    Raises ValueError (missing Message-Id or non-ascii sender) and
    DuplicateMessage.
    """
    # timeit("1 start")
    mlist = MailingList.objects.get_or_create(name=list_name)[0]
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        # Outside batch imports, refresh list properties from Mailman.
        mlist.update_from_mailman()
    mlist.save()
    if mlist.archive_policy == ArchivePolicy.never.value:
        logger.info("Archiving disabled by list policy for %s", list_name)
        return
    # Fixed: Message.has_key() is a Python-2-only API (removed in
    # Python 3); use the `in` operator as the other code paths do.
    if "Message-Id" not in message:
        raise ValueError("No 'Message-Id' header in email", message)
    # timeit("2 after ml, before checking email & sender")
    msg_id = get_message_id(message)
    if Email.objects.filter(mailinglist=mlist, message_id=msg_id).exists():
        raise DuplicateMessage(msg_id)
    email = Email(mailinglist=mlist, message_id=msg_id)
    email.in_reply_to = get_ref(message)  # Find thread id
    # Sender
    try:
        from_name, from_email = parseaddr(message["From"])
        from_name = header_to_unicode(from_name).strip()
        sender_address = from_email.decode("ascii").strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    if not sender_address:
        # No usable address: synthesize one from the display name, or
        # fall back to a fixed placeholder.
        if from_name:
            sender_address = re.sub("[^a-z0-9]", "", from_name.lower())
            if not sender_address:
                sender_address = "unknown"
            sender_address = "{}@example.com".format(sender_address)
        else:
            sender_address = "*****@*****.**"
    sender = Sender.objects.get_or_create(address=sender_address)[0]
    sender.name = from_name  # update the name if needed
    sender.save()
    email.sender = sender
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        set_sender_mailman_id(sender)
    # timeit("3 after sender, before email content")
    # Headers
    email.subject = header_to_unicode(message.get("Subject"))
    if email.subject is not None:
        # limit subject size to 512, it's a varchar field
        email.subject = email.subject[:512]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = timezone.now()
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        msg_date = msg_date.astimezone(timezone.utc)  # store in UTC
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = int(
            ((utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60)
    # Content
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # timeit("4 after email content, before signals")
    # TODO: detect category?
    # Set or create the Thread
    if email.in_reply_to is not None:
        try:
            ref_msg = Email.objects.get(mailinglist=email.mailinglist,
                                        message_id=email.in_reply_to)
        except Email.DoesNotExist:
            # the parent may not be archived (on partial imports), create
            # a new thread for now.
            pass
        else:
            # re-use parent's thread-id
            email.parent = ref_msg
            email.thread_id = ref_msg.thread_id
            ref_msg.thread.date_active = email.date
            ref_msg.thread.save()
    thread_created = False
    if email.thread_id is None:
        # Create the thread if not found
        thread = Thread.objects.create(
            mailinglist=email.mailinglist,
            thread_id=email.message_id_hash,
            date_active=email.date)
        thread_created = True
        email.thread = thread
    email.save()  # must save before setting the thread.starting_email
    if thread_created:
        thread.starting_email = email
        thread.save()
        new_thread.send("Mailman", thread=thread)
    # Signals
    new_email.send("Mailman", email=email)
    # timeit("5 after signals, before save")
    # timeit("6 after save")
    # compute thread props here because email must have been saved before
    # (there will be DB queries in this function)
    if not getattr(settings, "HYPERKITTY_BATCH_MODE", False):
        compute_thread_order_and_depth(email.thread)
    # Attachments (email must have been saved before)
    for attachment in attachments:
        counter, name, content_type, encoding, content = attachment
        if Attachment.objects.filter(email=email, counter=counter).exists():
            continue
        Attachment.objects.create(
            email=email, counter=counter, name=name,
            content_type=content_type, encoding=encoding, content=content)
    return email.message_id_hash
def handle(self, *args, **options):
    """Entry point of the management command: import mbox files into the DB.

    Mbox file paths come from ``options["mbox"]``; ``options`` also carry
    the list address, verbosity, ``since`` cutoff and sync flags.  After
    the raw import, thread order/depth is recomputed in batches, list
    properties are synchronized with Mailman (unless disabled), and the
    view cache for the list is warmed up.
    """
    self._check_options(options)
    setup_logging(self, options["verbosity"])
    # main
    list_address = options["list_address"].lower()
    # Keep autocommit on SQLite:
    # https://docs.djangoproject.com/en/1.8/topics/db/transactions/#savepoints-in-sqlite
    # if (settings.DATABASES["default"]["ENGINE"]
    #         != "django.db.backends.sqlite3":
    #     transaction.set_autocommit(False)
    # Batch mode suppresses per-message Mailman lookups during the import.
    settings.HYPERKITTY_BATCH_MODE = True
    # Only import emails newer than the latest email in the DB
    # (the newest stored date becomes the default "since" cutoff).
    latest_email_date = Email.objects.filter(
        mailinglist__name=list_address).values("date").order_by(
        "-date").first()
    if latest_email_date and not options["since"]:
        options["since"] = latest_email_date["date"]
    if options["since"] and options["verbosity"] >= 2:
        self.stdout.write("Only emails after %s will be imported"
                          % options["since"])
    importer = DbImporter(list_address, options, self.stdout, self.stderr)
    # disable mailman client for now
    for mbfile in options["mbox"]:
        if options["verbosity"] >= 1:
            self.stdout.write("Importing from mbox file %s to %s"
                              % (mbfile, list_address))
        # Skip files whose mtime predates the cutoff, unless told otherwise.
        if not options["ignore_mtime"] and options["since"] is not None:
            mtime = datetime.fromtimestamp(os.path.getmtime(mbfile),
                                           tz.tzlocal())
            if mtime <= options["since"]:
                if options["verbosity"] >= 2:
                    self.stdout.write('Mailbox file for %s is too old'
                                      % list_address)
                continue
        importer.from_mbox(mbfile)
        if options["verbosity"] >= 2:
            total_in_list = Email.objects.filter(
                mailinglist__name=list_address).count()
            self.stdout.write(' %s emails are stored into the database'
                              % total_in_list)
    if options["verbosity"] >= 1:
        self.stdout.write("Computing thread structure")
    # Work on batches of thread ids to avoid creating a huge SQL request
    # (it's an IN statement)
    thread_ids = list(importer.impacted_thread_ids)
    while thread_ids:
        thread_ids_batch = thread_ids[:100]
        thread_ids = thread_ids[100:]
        for thread in Thread.objects.filter(id__in=thread_ids_batch):
            compute_thread_order_and_depth(thread)
    if not options["no_sync_mailman"]:
        if options["verbosity"] >= 1:
            self.stdout.write("Synchronizing properties with Mailman")
        sync_with_mailman()
    # if not transaction.get_autocommit():
    #     transaction.commit()
    if options["verbosity"] >= 1:
        self.stdout.write("Warming up cache")
    call_command("hyperkitty_warm_up_cache", list_address)
    if options["verbosity"] >= 1:
        self.stdout.write(
            "The full-text search index is not updated for this list. "
            "It will not be updated by the 'minutely' incremental "
            "update job. To update the index for this list, run the "
            "'manage.py update_index_one_list {}' command.".format(
                list_address))