def is_spam(text):
    """Combine the stored probabilities of the known trigrams found in *text*.

    Iterates over ``generate_trigrams(text)`` and, for every trigram that is
    a key of the module-level ``interesting`` mapping, collects the value
    stored there.  Collection stops as soon as the list holds more than
    ``MAX_PROBS`` values (so at most ``MAX_PROBS + 1`` are gathered).

    Returns the result of ``combine_probs`` applied to the collected list,
    which may be empty when no trigram of *text* is in ``interesting``.
    """
    trigram_probs = []
    for trigram in generate_trigrams(text):
        # The ``in`` operator replaces dict.has_key(), which no longer
        # exists in Python 3 and behaves identically on Python 2.
        if trigram in interesting:
            trigram_probs.append(interesting[trigram])
            # The list only grows on append, so the cap is checked here.
            if len(trigram_probs) > MAX_PROBS:
                break
    return combine_probs(trigram_probs)
# NOTE(review): the line below is whitespace-collapsed Python 2 source; its
# original line breaks and indentation are lost, so it is kept untouched.
#
# It begins part-way through a function body: it reads `trigram_list` and
# `top`, neither of which is assigned on this line. Presumably this is the
# body of the `report` function called at the very end -- TODO confirm
# against the complete file.
#
# Report fragment: for every (trigram, (spam, good)) item of `trigrams` it
# computes ratio = spam / (spam + good), appends (ratio, trigram, spam, good)
# to `trigram_list` when spam + good > 100 and the ratio is NOT strictly
# between 0.1 and 0.9, sorts the list, then prints the first `top` entries
# side by side with the last `top` entries.
#
# Script entry point (`if __name__ == '__main__'`): loads the model with
# read_trigrams('trigrams'), reads "<SPAM|GOOD> <filename>" instruction lines
# from stdin, and for each trigram of the named file increments slot 0 (SPAM)
# or slot 1 (GOOD) of its [spam, good] counter pair, creating [0, 0] first if
# the trigram is new. Afterwards it calls write_trigrams(trigrams, 'trigrams')
# and report(trigrams).
#
# NOTE(review): `option, fname = line.split()` raises ValueError unless the
# line has exactly two whitespace-separated fields; `open(fname)` is never
# explicitly closed; `has_key()` and the print statements are Python 2 only;
# `sys.exit()` with no argument exits with status 0 even on the error path.
for trigram, (spam, good) in trigrams.items(): ratio = float(spam) / (spam + good) if spam + good > 100 and not (0.9 > ratio > 0.1): trigram_list.append((ratio, trigram, spam, good)) trigram_list.sort() for best, worst in zip(trigram_list[:top], trigram_list[-top:]): print "%.2f %s: (%6d | %6d ) %.2f %s: (%6d | %6d )" % (best + worst) if __name__ == '__main__': # Open an existing partial model, if it exists trigrams = read_trigrams('trigrams') # Process each instruction line from STDIN for line in sys.stdin: print line.strip(), option, fname = line.split() if option not in ('SPAM', 'GOOD'): print "Specify input type: SPAM/GOOD" sys.exit() for trigram in generate_trigrams(open(fname).read()): if not trigrams.has_key(trigram): trigrams[trigram] = [0, 0] if option == 'SPAM': trigrams[trigram][0] += 1 elif option == 'GOOD': trigrams[trigram][1] += 1 print '...', len(trigrams) write_trigrams(trigrams, 'trigrams') report(trigrams)
# NOTE(review): the line below is whitespace-collapsed Python 2 source; its
# original line breaks and indentation are lost, so it is kept untouched.
#
# It begins inside a function body with no visible `def` header: it
# initialises `trigram_list` but reads `top`, which is not assigned on this
# line. Presumably this is the body of the `report` function called at the
# very end -- TODO confirm against the complete file.
#
# Report fragment: for every (trigram, (spam, good)) item of `trigrams` it
# computes ratio = spam / (spam + good), appends (ratio, trigram, spam, good)
# to `trigram_list` when spam + good > 100 and the ratio is NOT strictly
# between 0.1 and 0.9, sorts the list, then prints the first `top` entries
# side by side with the last `top` entries.
#
# Script entry point (`if __name__=='__main__'`): loads the model with
# read_trigrams('trigrams'), reads "<SPAM|GOOD> <filename>" instruction lines
# from stdin, and for each trigram of the named file increments slot 0 (SPAM)
# or slot 1 (GOOD) of its [spam, good] counter pair, creating [0,0] first if
# the trigram is new. Afterwards it calls write_trigrams(trigrams,'trigrams')
# and report(trigrams).
#
# NOTE(review): `option, fname = line.split()` raises ValueError unless the
# line has exactly two whitespace-separated fields; `open(fname)` is never
# explicitly closed; `has_key()` and the print statements are Python 2 only;
# `sys.exit()` with no argument exits with status 0 even on the error path.
trigram_list = [] for trigram,(spam,good) in trigrams.items(): ratio = float(spam)/(spam+good) if spam+good > 100 and not (0.9 > ratio > 0.1): trigram_list.append((ratio,trigram, spam, good)) trigram_list.sort() for best, worst in zip(trigram_list[:top],trigram_list[-top:]): print "%.2f %s: (%6d | %6d ) %.2f %s: (%6d | %6d )" % (best+worst) if __name__=='__main__': # Open an existing partial model, if it exists trigrams = read_trigrams('trigrams') # Process each instruction line from STDIN for line in sys.stdin: print line.strip(), option, fname = line.split() if option not in ('SPAM','GOOD'): print "Specify input type: SPAM/GOOD" sys.exit() for trigram in generate_trigrams(open(fname).read()): if not trigrams.has_key(trigram): trigrams[trigram] = [0,0] if option=='SPAM': trigrams[trigram][0] += 1 elif option=='GOOD': trigrams[trigram][1] += 1 print '...', len(trigrams) write_trigrams(trigrams,'trigrams') report(trigrams)