def query_by_body(body, top_k=5): ''' query top_k most probable replies based on raw email body ''' doc = body2doc(body) print '[doc]', doc vec = avg_word_vec(doc) # retrieve top-k past inquery emails. candidates = query_by_vec_online('inbox', vec, top_k) # get past replies. db = EmailDB('outbox') replies = [] for (email, score) in candidates: reply = db.select( where='message_id=:message_id', data=dict(message_id=email.message_id) ) reply = list(reply)[0].body replies.append((reply, score)) return replies
def query_by_vec_online(table, vec, top_k=5): init_word2vec() db = EmailDB(table) scores = [] for email in db.select(): if email.message_id in CACHE: candidate_vec = CACHE[email.message_id] else: doc = body2doc(email.body) print "[cache doc]", doc candidate_vec = avg_word_vec(doc) CACHE[email.message_id] = candidate_vec score = np.dot(vec, candidate_vec) / npla.norm(vec) / npla.norm(candidate_vec) # cosine similarity. score = float(score) scores.append((email, score)) scores = [pair for pair in scores if pair[1] == pair[1]] scores = sorted(scores, key=lambda pair: pair[1], reverse=True) print "[matched]", scores[:top_k] return scores[:top_k]
def sync(csv_path): inbox_db = EmailDB("inbox") outbox_db = EmailDB("outbox") in_mails = [] out_mails = [] for (in_mail, out_mail) in extract(csv_path): in_mails.append(in_mail) out_mails.append(out_mail) def get_in_mail(): for (mi, in_mail) in enumerate(in_mails): print "[inbox][%d/%d]" % (mi, len(in_mails)) yield in_mail def get_out_mail(): for (mi, out_mail) in enumerate(out_mails): print "[outbox][%d/%d]" % (mi, len(out_mails)) yield out_mail inbox_db.update_all("*****@*****.**", get_in_mail()) outbox_db.update_all("*****@*****.**", get_out_mail())