Пример #1
0
def query_by_vec_online(table, vec, top_k=5):
    """Rank the emails stored in ``table`` by cosine similarity to ``vec``.

    Each row yielded by ``EmailDB(table).select()`` is embedded with
    ``avg_word_vec(body2doc(email.body))``. Embeddings are stored in the
    ``CACHE`` mapping (defined outside this block) under
    ``email.message_id`` and reused on later calls.

    Args:
        table: Table name handed to ``EmailDB``.
        vec: Query vector; assumed to have the same dimensionality as the
            vectors returned by ``avg_word_vec`` (TODO confirm).
        top_k: Number of best matches to keep (used as a slice bound).

    Returns:
        A list of ``(email, score)`` tuples sorted by descending score and
        truncated with ``[:top_k]``; ``score`` is a Python float. Emails
        whose similarity evaluates to NaN are left out.
    """
    init_word2vec()
    db = EmailDB(table)
    # The norm of the query vector does not depend on the candidate, so it
    # is computed once instead of once per email.
    vec_norm = npla.norm(vec)
    scores = []
    for email in db.select():
        if email.message_id in CACHE:
            candidate_vec = CACHE[email.message_id]
        else:
            doc = body2doc(email.body)
            # A single-argument print(...) emits the same text whether it is
            # parsed as a Python 2 statement or a Python 3 function call.
            print("[cache doc] %s" % (doc,))
            candidate_vec = avg_word_vec(doc)
            CACHE[email.message_id] = candidate_vec
        # Cosine similarity.
        score = float(
            np.dot(vec, candidate_vec) / vec_norm / npla.norm(candidate_vec)
        )
        # A NaN similarity (e.g. 0/0 from a zero-norm vector) cannot be
        # ranked, so the email is dropped.
        if np.isnan(score):
            continue
        scores.append((email, score))

    # list.sort is stable, so ties keep the order returned by the database.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    matched = scores[:top_k]

    print("[matched] %s" % (matched,))
    return matched
Пример #2
0
# implement autoreply to emails.
from surrobot.core.mail2vec import avg_word_vec, init_word2vec
from surrobot.core.preprocess import body2doc
from surrobot.db.embedding import query_by_vec_online
from surrobot.db.db import EmailDB

from pprint import pprint

# Runs at import time, before any query function in this module is called.
# NOTE(review): presumably loads the word2vec model that avg_word_vec relies
# on -- confirm in surrobot.core.mail2vec.
init_word2vec()

def query_by_body(body, top_k=5):
    """Query the most probable replies for a raw email body.

    The body is converted with ``body2doc`` and embedded with
    ``avg_word_vec``. The closest 'inbox' emails are retrieved through
    ``query_by_vec_online``, and for each of them the 'outbox' row with the
    same ``message_id`` supplies the reply text.

    Args:
        body: Raw email body.
        top_k: Number of inbox candidates to retrieve.

    Returns:
        A list of ``(reply_body, score)`` tuples in the order the candidates
        were returned. Candidates with no matching outbox row are skipped, so
        the list may hold fewer than ``top_k`` entries.
    """
    doc = body2doc(body)
    # A single-argument print(...) emits the same text whether it is parsed
    # as a Python 2 statement or a Python 3 function call.
    print("[doc] %s" % (doc,))
    vec = avg_word_vec(doc)
    # Retrieve the top-k past inquiry emails.
    candidates = query_by_vec_online('inbox', vec, top_k)
    # Look up the past reply for each candidate.
    db = EmailDB('outbox')
    replies = []
    for email, score in candidates:
        matches = list(db.select(
            where='message_id=:message_id',
            data=dict(message_id=email.message_id)
        ))
        if not matches:
            # This inquiry has no stored reply; indexing [0] on the empty
            # result would raise IndexError and abort the whole query.
            continue
        replies.append((matches[0].body, score))
    return replies


if __name__ == '__main__':