class Stats:
    '''
    Frequency statistics and probabilities derived from a term/class matrix.

    The wrapped matrix object is expected to provide:
      * classes      - mapping of class label -> per-term frequency sequence
      * terms        - the vocabulary (returned as-is by get_terms_freq)
      * mx[t]        - index of term t, or -1 when t is not in the vocabulary
      * vocabulary() - iterable of all terms
    '''

    def __init__(self, matrix):
        '''
        Collect per-class term frequencies and totals from matrix.

        After construction:
          * self.classes[c]['terms'] is the frequency sequence of class c
          * self.classes[c]['total'] is the sum of that sequence
          * self.terms accumulates the frequency sequences of all classes
          * self.N is the sum of all class totals
        '''
        self.mx = matrix
        self.N = 0
        self.classes = {}
        # SuperList is defined elsewhere in the project; its add() is
        # presumably an element-wise accumulation -- TODO confirm.
        self.terms = SuperList()
        for c in self.mx.classes:
            class_terms = self.mx.classes[c]
            class_total = sum(class_terms)
            self.classes[c] = {'terms': class_terms, 'total': class_total}
            self.terms.add(class_terms)
            self.N += class_total
        self.mi_terms = []

    def __str__(self):
        s = 'Matrix Stats:'
        s += '\n * Vocabulary/Terms: %d/%d' % (len(self.terms), self.N)
        return s

    def _fraction(self, count):
        ' Return count / N as a float, or 0.0 when no terms were counted '
        if not self.N:
            # Empty corpus: every probability is 0 instead of a
            # ZeroDivisionError.
            return 0.0
        return float(count) / self.N

    def getN(self):
        '''
        Get total number of terms, counting their frequencies too.
        Notice: This is not the same as len(vocabulary)
        '''
        return self.N

    def get_terms_freq(self, normalized=False):
        '''
        Returns a two-item list [terms, frequencies]: the vocabulary terms
        and their occurrences.
        If normalized is True, divide the occurrences by the total number
        of terms.
        '''
        terms = self.mx.terms
        freq = self.terms.div(self.N) if normalized else self.terms
        return [terms, freq]

    def pr_term(self, t):
        '''
        Get probability of term t.
        Returns 0.0 when t is not in the vocabulary or the corpus is empty.
        '''
        i = self.mx[t]
        if i == -1:
            return 0.0
        return self._fraction(self.terms[i])

    def pr_class(self, c):
        '''
        Get probability of class c.
        Returns 0.0 when the corpus is empty.
        '''
        return self._fraction(self.classes[c]['total'])

    def pr_joint(self, t, c):
        '''
        Get joint probability between term t and class c.
        Returns 0.0 when t is not in the vocabulary or the corpus is empty.
        '''
        i = self.mx[t]
        if i == -1:
            return 0.0
        return self._fraction(self.classes[c]['terms'][i])

    def mi(self):
        '''
        Compute one mutual-information score per vocabulary term.

        For each term t the score is the sum over all classes c of
            pr_joint(t, c) * log10(pr_joint(t, c) / (pr_term(t) * pr_class(c)))

        A (term, class) pair with zero probability contributes 0 to the
        sum; it does not reset the sum, so the score no longer depends on
        the order in which classes are visited.

        The scores are stored in self.mi_terms (rebuilt on every call, in
        vocabulary order), printed together with self.classes, and returned.
        '''
        # Class probabilities do not depend on the term: compute them once.
        class_probs = [(c, self.pr_class(c)) for c in self.classes]

        scores = []
        for t in self.mx.vocabulary():
            p_term = self.pr_term(t)
            score = 0.0
            for c, p_class in class_probs:
                p_joint = self.pr_joint(t, c)
                # log10 is undefined at 0 and the denominator must be
                # non-zero; such cells contribute nothing.
                if p_joint <= 0 or p_term <= 0 or p_class <= 0:
                    continue
                score += p_joint * math.log10(p_joint / (p_term * p_class))
            scores.append(score)

        # Replace rather than extend, so repeated calls do not pile up
        # duplicate scores.
        self.mi_terms = scores
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(self.classes)
        print(self.mi_terms)
        return self.mi_terms