def get_message_weight(self, body):
    """Return the mean log10 term frequency of ``body``'s terms.

    A one-document ``SimpleTermDocumentMatrix`` is built from ``body`` and
    its terms (``min_doc_freq=1``) are matched against the term frequencies
    returned by ``self.tdm.get_term_freq()``. The result is the arithmetic
    mean of ``log10(frequency)`` over every matching term.

    Args:
        body: Message body handed to ``SimpleTermDocumentMatrix.add_doc``.

    Returns:
        The mean of the log10 frequencies of the matching terms, or ``1``
        when no term of ``body`` matches (including when ``body`` yields no
        terms at all).
    """
    body_tdm = SimpleTermDocumentMatrix()
    body_tdm.add_doc(body)
    body_terms = body_tdm.get_terms(min_doc_freq=1)
    tm = self.tdm.get_term_freq()

    # Accumulate over ALL body terms. Rebinding the list inside the loop
    # would keep only the last term's matches and would leave the name
    # unbound when ``body_terms`` is empty.
    tm_log10 = []
    for body_term in body_terms:
        # NOTE(review): tm is presumably a dict of term -> frequency; the
        # concrete type is not visible here, so the equality scan over tm
        # is kept rather than replaced with a membership test.
        tm_log10.extend(math.log10(tm[k]) for k in tm if body_term == k)

    if not tm_log10:
        return 1
    return sum(tm_log10) / len(tm_log10)
def get_tdm(email_path):
    """Build a ``SimpleTermDocumentMatrix`` from one e-mail file or a directory.

    If ``email_path`` is a directory, every regular file directly inside it
    is used; otherwise ``email_path`` itself is treated as the only file.
    The body of each file (via ``get_body``) is added to the matrix as one
    document.

    Args:
        email_path: Path to an e-mail file or to a directory of e-mail files.

    Returns:
        The populated ``SimpleTermDocumentMatrix``.
    """
    if not os.path.isdir(email_path):
        paths = [email_path]
    else:
        paths = []
        for entry in os.listdir(email_path):
            # Sub-directories and other non-file entries are skipped.
            if os.path.isfile(os.path.join(email_path, entry)):
                paths.append(email_path + "/" + entry)

    matrix = SimpleTermDocumentMatrix()
    for path in paths:
        matrix.add_doc(get_body(path))
    return matrix
def train(self, files):
    """Train on the given e-mail files and store the results on ``self``.

    Each file is parsed with ``parse_email``; the first four items of the
    result are used as sender, subject, date and body. The body is added to
    a fresh ``SimpleTermDocumentMatrix``, and the sender/thread dictionaries
    are passed to ``self.calc_sender_freq`` and
    ``self.calc_weight_on_thread`` (which presumably fill them in place).

    After all files are processed the results are stored as
    ``self.senders_freq``, ``self.threads_weight`` and ``self.tdm``.

    Args:
        files: Iterable of e-mail file paths accepted by ``parse_email``.
    """
    sender_counts = {}
    thread_weights = {}
    term_matrix = SimpleTermDocumentMatrix()

    for email_file in files:
        parsed = parse_email(email_file)
        sender, subject, date_, body = parsed[0], parsed[1], parsed[2], parsed[3]

        term_matrix.add_doc(body)
        self.calc_sender_freq(sender, sender_counts)
        self.calc_weight_on_thread(subject, sender, date_, thread_weights)

    self.senders_freq = sender_counts
    self.threads_weight = thread_weights
    self.tdm = term_matrix