def chi2_spamprob(self, wordstream, evidence=False):
    """Return best-guess probability that wordstream is spam.

    wordstream is an iterable object producing words.
    The return value is a float in [0.0, 1.0].

    If optional arg evidence is True, the return value is a pair
        probability, evidence
    where evidence is a list of (word, probability) pairs sorted by
    ascending probability, preceded by the two pseudo-entries
    ('*H*', H) and ('*S*', S) carrying the ham and spam scores.
    """
    from math import frexp, log as ln

    # H accumulates the product of the clue probabilities and S the
    # product of their complements.  To avoid underflowing to 0.0 the
    # products are kept as mantissa * 2**exponent: whenever a mantissa
    # gets tiny, frexp() renormalizes it and the exponent is banked in
    # Hexp / Sexp.
    H = S = 1.0
    Hexp = Sexp = 0

    # Each clue is a (probability, word, record) triple; only the
    # probability is needed for the score itself.
    clues = self._getclues(wordstream)
    for prob, _word, _record in clues:
        S *= 1.0 - prob
        H *= prob
        if S < 1e-200:  # prevent underflow
            S, e = frexp(S)
            Sexp += e
        if H < 1e-200:  # prevent underflow
            H, e = frexp(H)
            Hexp += e

    # Natural log of the real products: ln(x * 2**i) = ln(x) + i * ln(2).
    # LN2 is a module-level constant (expected to hold ln(2)).
    S = ln(S) + Sexp * LN2
    H = ln(H) + Hexp * LN2

    n = len(clues)
    if n:
        # chi2Q is a module-level helper; it is called with -2 * ln(product)
        # and 2*n (presumably the degrees of freedom -- confirm against
        # its definition).
        S = 1.0 - chi2Q(-2.0 * S, 2 * n)
        H = 1.0 - chi2Q(-2.0 * H, 2 * n)
        # Combine as S-H, rescaled from [-1, 1] into [0, 1].
        prob = (S - H + 1.0) / 2.0
    else:
        # No clues at all: report "don't know".
        prob = 0.5

    if not evidence:
        return prob

    # Key-based sort: same (stable) ordering as the old
    # cmp(a[1], b[1]) comparison, but also valid on Python 3, where
    # list.sort() takes no cmp function and cmp() no longer exists.
    ranked = sorted(((w, p) for p, w, _r in clues),
                    key=lambda pair: pair[1])
    return prob, [('*H*', H), ('*S*', S)] + ranked
def chi2_spamprob(self, wordstream, evidence=False):
    """Return best-guess probability that wordstream is spam.

    wordstream is an iterable object producing words.
    The return value is a float in [0.0, 1.0].

    If optional arg evidence is True, the return value is a pair
        probability, evidence
    where evidence is a list of (word, probability) pairs sorted by
    ascending probability, preceded by the two pseudo-entries
    ('*H*', H) and ('*S*', S) carrying the ham and spam scores.
    """
    from math import frexp, log as ln

    # We compute two chi-squared statistics, one for ham and one for
    # spam.  The sum-of-the-logs business is more sensitive to probs
    # near 0 than to probs near 1, so the spam measure uses 1-p (so
    # that high-spamprob words have greatest effect), and the ham
    # measure uses p directly (so that low-spamprob words have greatest
    # effect).
    #
    # For optimization, sum-of-logs == log-of-product, and f.p.
    # multiplication is a lot cheaper than calling ln().  It's easy
    # to underflow to 0.0, though, so we simulate unbounded dynamic
    # range via frexp.  The real product H = this H * 2**Hexp, and
    # likewise the real product S = this S * 2**Sexp.
    H = S = 1.0
    Hexp = Sexp = 0

    # Each clue is a (probability, word, record) triple; only the
    # probability is needed for the score itself.
    clues = self._getclues(wordstream)
    for prob, _word, _record in clues:
        S *= 1.0 - prob
        H *= prob
        if S < 1e-200:  # prevent underflow
            S, e = frexp(S)
            Sexp += e
        if H < 1e-200:  # prevent underflow
            H, e = frexp(H)
            Hexp += e

    # Compute the natural log of the product = sum of the logs:
    # ln(x * 2**i) = ln(x) + i * ln(2).
    S = ln(S) + Sexp * LN2
    H = ln(H) + Hexp * LN2

    n = len(clues)
    if n:
        S = 1.0 - chi2Q(-2.0 * S, 2 * n)
        H = 1.0 - chi2Q(-2.0 * H, 2 * n)

        # How to combine these into a single spam score?  We originally
        # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
        # systematic problem is that we could end up being near-certain
        # a thing was (for example) spam, even if S was small, provided
        # that H was much smaller.
        # Rob Hooft stared at these problems and invented the measure
        # we use now, the simpler S-H, scaled into [0., 1.].
        prob = (S - H + 1.0) / 2.0
    else:
        # No clues at all: report "don't know".
        prob = 0.5

    if not evidence:
        return prob

    # Key-based sort: same (stable) ordering as the old
    # cmp(a[1], b[1]) comparison, but also valid on Python 3, where
    # list.sort() takes no cmp function and cmp() no longer exists.
    ranked = sorted(((w, p) for p, w, _r in clues),
                    key=lambda pair: pair[1])
    return prob, [('*H*', H), ('*S*', S)] + ranked
def chi2_spamprob(self, wordstream, evidence=False):
    """Return best-guess probability that wordstream is spam.

    wordstream is an iterable object producing words.
    The return value is a float in [0.0, 1.0].

    If optional arg evidence is True, the return value is a pair
        probability, evidence
    where evidence is a list of (word, probability) pairs sorted by
    ascending probability, preceded by the two pseudo-entries
    ('*H*', H) and ('*S*', S) carrying the ham and spam scores.

    Side effect: the computed probability is also stored on the
    wordstream object as its ``prob`` attribute, so wordstream must
    accept attribute assignment.
    """
    from math import frexp, log as ln

    # We compute two chi-squared statistics, one for ham and one for
    # spam.  The sum-of-the-logs business is more sensitive to probs
    # near 0 than to probs near 1, so the spam measure uses 1-p (so
    # that high-spamprob words have greatest effect), and the ham
    # measure uses p directly (so that low-spamprob words have greatest
    # effect).
    #
    # For optimization, sum-of-logs == log-of-product, and f.p.
    # multiplication is a lot cheaper than calling ln().  It's easy
    # to underflow to 0.0, though, so we simulate unbounded dynamic
    # range via frexp.  The real product H = this H * 2**Hexp, and
    # likewise the real product S = this S * 2**Sexp.
    H = S = 1.0
    Hexp = Sexp = 0

    # Each clue is a (probability, word, record) triple; only the
    # probability is needed for the score itself.
    clues = self._getclues(wordstream)
    for prob, _word, _record in clues:
        S *= 1.0 - prob
        H *= prob
        if S < 1e-200:  # prevent underflow
            S, e = frexp(S)
            Sexp += e
        if H < 1e-200:  # prevent underflow
            H, e = frexp(H)
            Hexp += e

    # Compute the natural log of the product = sum of the logs:
    # ln(x * 2**i) = ln(x) + i * ln(2).
    S = ln(S) + Sexp * LN2
    H = ln(H) + Hexp * LN2

    n = len(clues)
    if n:
        S = 1.0 - chi2Q(-2.0 * S, 2 * n)
        H = 1.0 - chi2Q(-2.0 * H, 2 * n)

        # How to combine these into a single spam score?  We originally
        # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
        # systematic problem is that we could end up being near-certain
        # a thing was (for example) spam, even if S was small, provided
        # that H was much smaller.
        # Rob Hooft stared at these problems and invented the measure
        # we use now, the simpler S-H, scaled into [0., 1.].
        prob = (S - H + 1.0) / 2.0
    else:
        # No clues at all: report "don't know".
        prob = 0.5

    # Remember the score on the stream itself; done once here rather
    # than separately in each return branch.
    wordstream.prob = prob

    if not evidence:
        return prob

    # Key-based sort: same (stable) ordering as the old
    # cmp(a[1], b[1]) comparison, but also valid on Python 3, where
    # list.sort() takes no cmp function and cmp() no longer exists.
    ranked = sorted(((w, p) for p, w, _r in clues),
                    key=lambda pair: pair[1])
    return prob, [('*H*', H), ('*S*', S)] + ranked