Exemplo n.º 1
0
def best_features():
    """Rank every word in the training corpora by mutual information.

    Builds '/word/'-delimited regex features from the ham and spam training
    sets, estimates p(word | class) with Laplace smoothing via
    ``toolkit.countre``, and scores each word by the mutual information
    between word presence and the class label.

    Returns:
        List of ``(word, score)`` pairs sorted by score, highest first.
    """
    ham_train = bayes.HAM + bayes.TRAIN
    spam_train = bayes.SPAM + bayes.TRAIN
    # Wrap each word in slashes so toolkit treats it as a literal-match
    # regular expression.
    ham_words = map(lambda s: '/' + s + '/',
            set.union(*map(toolkit.word_bag, toolkit.get_files(ham_train))))
    spam_words = map(lambda s: '/' + s + '/',
            set.union(*map(toolkit.word_bag, toolkit.get_files(spam_train))))
    all_words = set(ham_words + spam_words)

    # p(w_i | c); words never seen in a class default to toolkit.ZERO.
    p_w_ham = defaultdict(lambda: toolkit.ZERO, zip(ham_words,
            toolkit.countre(ham_train, ham_words, smoothing=toolkit.ONE)[0]))
    p_w_spam = defaultdict(lambda: toolkit.ZERO, zip(spam_words,
            toolkit.countre(spam_train, spam_words, smoothing=toolkit.ONE)[0]))

    maxlog = math.log(toolkit.MAX)
    def no_error(x, y):
        # log(x / y), clamped to a large finite value when y is zero or the
        # ratio falls outside log's domain.
        try:
            return math.log(x / y)
        except (ZeroDivisionError, ValueError):
            return maxlog

    # NOTE(review): the ratio here is log(p(w) / p(w|c)); the textbook
    # mutual-information term is log(p(w|c) / p(w)) -- confirm whether the
    # inverted sign is intentional before relying on the ranking direction.
    mut_inf = dict()
    for word in all_words:
        p_w_h = p_w_ham[word]
        p_nw_h = toolkit.ONE - p_w_h
        p_w_s = p_w_spam[word]
        p_nw_s = toolkit.ONE - p_w_s
        # Marginal p(w) under the class priors.
        p_w = p_w_h * toolkit.PRIOR_HAM + p_w_s * toolkit.PRIOR_SPAM
        p_nw = toolkit.ONE - p_w
        mut_inf[word] = p_w_h * toolkit.PRIOR_HAM * no_error(p_w, p_w_h)
        mut_inf[word] += p_nw_h * toolkit.PRIOR_HAM * no_error(p_nw, p_nw_h)
        mut_inf[word] += p_w_s * toolkit.PRIOR_SPAM * no_error(p_w, p_w_s)
        mut_inf[word] += p_nw_s * toolkit.PRIOR_SPAM * no_error(p_nw, p_nw_s)
    return sorted(mut_inf.iteritems(), key=operator.itemgetter(1), reverse=True)
Exemplo n.º 2
0
def instance_feature_prob(instance, features, clss, train=True, smoothing=toolkit.ZERO):
    """
    Corresponds to [p(x_i | C_k) for x_i in x] from the assignment.
    
    The instance is a message, which is either of clss HAM or clss SPAM.
    features is a collection of (compiled) regular expressions.  If applied to
    the instance, we compute for every feature, if it occurs in the instance.
    We divide that number by how many training/testing instances of the class
    match against the feature.

    ``The probability of an observation (i.e., an email) given the class (i.e., 
    ham or spam), p(x|Ck ) is then modelled as the probability of seeing
    specific keywords in the email.''
    """
    # Pick the training or testing folder for the requested class.
    folder = clss + (TRAIN if train else TEST)
    # Per-feature match rate over the class's folder (smoothed).
    class_rates = toolkit.countre(folder, features, smoothing=smoothing)[0]
    # Per-feature presence indicator for this single instance.
    occurrences = toolkit.presentre(instance, features)
    probs = []
    for rate, present in zip(class_rates, occurrences):
        if rate == toolkit.ZERO and present == toolkit.ZERO:
            # 0^0 convention: an absent, never-seen feature contributes 1.
            probs.append(1)
        else:
            # Bernoulli likelihood: rate^present * (1 - rate)^(1 - present).
            probs.append(pow(rate, present) * pow(1 - rate, 1 - present))
    return probs