def prethread_index_factory_new():
    """Thread all raw mail into conversations and queue the resulting docs.

    Pipeline, with a timestamped progress line and elapsed time per stage:

    1. wrap every item of ``mail_grab.iteritems()`` with ``msg_factory``
    2. wrap every message with ``conv_factory``
    3. feed the conversations to a fresh ``lazythread_container``
    4. run ``_ensure_threading_integrity(threader, True)``
    5. pass every resulting doc to ``xconn.replace`` and call
       ``xconn.flush()``

    NOTE: stages 1 and 2 only *create* generator expressions, so the
    elapsed time printed for them does not include any per-message work;
    that work happens when the generators are consumed (presumably inside
    ``threader.thread`` -- confirm against its implementation).

    Returns nothing; progress is reported on stdout.
    """
    threader = lazythread_container()

    print('%s - creating msg containers from raw mail' % datetime.now())
    t = time.time()
    # Lazy on purpose; an eager forkmap.map(msg_factory, ...) variant was
    # previously used here and is no longer active.
    all_msgs = (msg_factory(x) for x in mail_grab.iteritems())
    t = time.time() - t
    print("done! took %r seconds" % t)

    print('%s - building conversation objects' % datetime.now())
    t = time.time()
    all_msgs = (conv_factory(x) for x in all_msgs)
    t = time.time() - t
    print("done! took %r seconds" % t)

    print('%s - threading messages into conversations' % datetime.now())
    t = time.time()
    threader.thread(all_msgs)
    t = time.time() - t
    print("done! took %r seconds" % t)

    print('%s - running integrity checker' % datetime.now())
    t = time.time()
    docs = _ensure_threading_integrity(threader, True)
    t = time.time() - t
    print("done! took %r seconds" % t)

    print('%s - queueing docs' % datetime.now())
    t = time.time()
    # Explicit loop instead of map(): we only want the side effect, not a
    # list of return values.
    for doc in docs:
        xconn.replace(doc)
    xconn.flush()
    t = time.time() - t
    print("done! took %r seconds" % t)

    print("%s - waiting for work to finish" % datetime.now())
def _preindex_thread(msgs):
    """Thread not-yet-indexed messages together with the indexed documents.

    The documents returned by ``iterdocs()`` (if truthy) are converted with
    ``msg_factory``/``conv_factory`` and threaded first, then ``msgs`` are
    converted with ``conv_factory`` and threaded into the same container.
    When indexed documents were present, each message in ``msgs`` gets the
    thread id of the conversation found under its first message id or,
    failing that, under its first subject.

    Args:
        msgs: messages to thread. Iterated more than once and passed to
            ``len()``, so it must be a sized, re-iterable collection.
            Each item needs ``thread``, ``msgid`` and ``subject``
            attributes.

    Returns:
        Tuple ``(msgs, threader)``: the same ``msgs`` object (its items'
        ``thread`` lists possibly extended in place) and the populated
        ``lazythread_container``.

    Raises:
        ValueError: if any message already has a truthy ``thread``.
    """
    # Index the incoming messages by their first message id.
    tracker = {}
    for msg in msgs:
        if msg.thread:
            raise ValueError('This is weird. why am i trying to thread something that already has a thread id?')
        tracker[msg.msgid[0]] = msg

    threader = lazythread_container()

    all_msgs = iterdocs()
    if all_msgs:
        print('yes to all_msgs')
        all_msgs = map(msg_factory, all_msgs)
        all_msgs = threadmap.map(conv_factory, all_msgs)
        threader.thread(all_msgs)
    print('threader b4 convs %s' % len(threader))

    threader.thread(threadmap.map(conv_factory, msgs))
    print('threader after convs %s' % len(threader))

    # Diagnostics: how many conversations hold a single message, and how
    # many messages the threader holds overall.
    single_msg_threads = 0
    threaded_total = 0
    for conv in threader:
        size = len(conv.messages)
        threaded_total += size
        if size == 1:
            single_msg_threads += 1
    print("found %s threads with one msg" % single_msg_threads)
    print("threader contains %s messages out of a total %s" % (threaded_total, len(msgs)))

    unthreaded = 0
    if all_msgs:
        for key in tracker:
            msg = tracker[key]
            # Look up by message id first, then fall back to the subject.
            # ``except Exception`` (rather than a bare ``except``) keeps the
            # best-effort behaviour without swallowing KeyboardInterrupt or
            # SystemExit.
            try:
                try:
                    conv = threader[key]
                except Exception:
                    conv = threader[msg.subject[0]]
            except Exception:
                unthreaded += 1
                # (debug output of the unmatched key/subject is disabled)
                continue
            msg.thread.extend(conv.thread)
    print("%s messages didn't find a thread" % unthreaded)

    return msgs, threader
def _ensure_threading_integrity(threader=None, all_new=False):
    """Reconcile every message's thread id with its conversation's id.

    Walks all conversations in ``threader`` and sorts their messages into
    two queues:

    * *update*  -- the message has no thread id yet (``msg.thread`` falsy)
    * *replace* -- the message has a thread id that differs from the
      conversation's

    Both queues are handed to ``modify_factory`` (with ``update_existing``
    and ``replace_existing`` respectively) and the two results are chained.

    Args:
        threader: an already populated thread container. When ``None`` a
            new ``lazythread_container`` is built from ``iterdocs()``.
        all_new: forwarded unchanged as the third argument of
            ``modify_factory``.

    Returns:
        An iterator yielding everything from the *update* result followed
        by everything from the *replace* result.
    """
    # Function-local import so this block does not depend on the file's
    # import header.
    import itertools

    # Test identity, not truthiness: a threader the caller passed in must be
    # used even when it is empty (an empty container may evaluate as false),
    # instead of silently re-threading the whole index.
    if threader is None:
        threader = lazythread_container()
        all_msgs = (msg_factory(x) for x in iterdocs())
        all_msgs = (conv_factory(x) for x in all_msgs)
        threader.thread(all_msgs)

    to_update = []
    to_replace = []
    for conv in threader:
        ctid = conv.thread
        for msg in conv.messages:
            # optimization: pass the msg container itself (not just an id)
            # so it doesn't have to be rebuilt again downstream
            id_data_tple = (msg, [('thread', ctid)])
            if not msg.thread:
                to_update.append(id_data_tple)
            elif ctid != msg.thread:
                to_replace.append(id_data_tple)

    print("in update queue %i" % len(to_update))
    print("in replace queue %i" % len(to_replace))

    print('%s - starting modify factory on to_update' % datetime.now())
    docs1 = modify_factory(to_update, update_existing, all_new)
    print('%s - starting modify factory on to_replace' % datetime.now())
    docs2 = modify_factory(to_replace, replace_existing, all_new)

    # itertools.chain accepts any iterable, unlike a hand-rolled chain that
    # calls .next() on each part directly.
    return itertools.chain(docs1, docs2)