예제 #1
0
class Stats:
    """Term/class frequency statistics derived from a matrix object.

    The matrix passed to the constructor is used through these members:

    * ``matrix.classes`` -- mapping of class label to a sequence of per-term
      frequencies (it is summed, and indexed by term position).
    * ``matrix[t]`` -- position of term ``t`` in the vocabulary, or ``-1``
      when the term is unknown.
    * ``matrix.terms`` and ``matrix.vocabulary()`` -- the vocabulary terms.

    Attributes:
        mx: the wrapped matrix object.
        N: sum of all term frequencies over all classes.
        classes: ``{label: {'terms': <frequencies>, 'total': <their sum>}}``.
        terms: ``SuperList`` built by ``add``-ing every class's frequencies
            (presumably an element-wise sum per term -- confirm against
            ``SuperList.add``).
        mi_terms: scores computed by the most recent call to ``mi()``, in
            ``matrix.vocabulary()`` order.
    """

    def __init__(self, matrix):
        """Collect per-class totals and the overall term count from matrix."""
        self.mx = matrix
        self.N = 0
        self.classes = {}
        self.terms = SuperList()
        for c in self.mx.classes:
            class_terms = self.mx.classes[c]
            class_total = sum(class_terms)
            self.classes[c] = {'terms': class_terms, 'total': class_total}
            self.terms.add(class_terms)
            self.N += class_total
        self.mi_terms = []

    def __str__(self):
        s = 'Matrix Stats:'
        s += '\n * Vocabulary/Terms: %d/%d' % (len(self.terms), self.N)
        return s

    def getN(self):
        """Return the total number of terms, counting their frequencies too.

        Note: this is not the same as ``len(vocabulary)``.
        """
        return self.N

    def get_terms_freq(self, normalized=False):
        """Return ``[terms, frequencies]`` for the vocabulary.

        When ``normalized`` is true the frequencies are divided by the total
        number of terms (via ``self.terms.div``); otherwise the raw
        ``self.terms`` object is returned.
        """
        terms = self.mx.terms
        freq = self.terms.div(self.N) if normalized else self.terms
        return [terms, freq]

    def pr_term(self, t):
        """Return the probability of term ``t``.

        Returns 0.0 for an unknown term or when no terms were counted.
        """
        i = self.mx[t]
        if i == -1 or not self.N:
            return 0.0
        return float(self.terms[i]) / self.N

    def pr_class(self, c):
        """Return the probability of class ``c``.

        Returns 0.0 when no terms were counted. Raises ``KeyError`` for an
        unknown class label.
        """
        total = self.classes[c]['total']
        if not self.N:
            return 0.0
        return float(total) / self.N

    def pr_joint(self, t, c):
        """Return the joint probability of term ``t`` and class ``c``.

        Returns 0.0 for an unknown term or when no terms were counted.
        """
        i = self.mx[t]
        if i == -1 or not self.N:
            return 0.0
        return float(self.classes[c]['terms'][i]) / self.N

    def mi(self):
        """Compute a mutual-information score for every vocabulary term.

        For each term ``t`` the score is the sum over classes ``c`` of
        ``P(t, c) * log10(P(t, c) / (P(t) * P(c)))``. A (term, class) pair
        with zero joint probability contributes nothing (0 * log 0 is taken
        as 0) and leaves the contributions of the other classes intact.

        The scores replace ``self.mi_terms`` (so repeated calls do not grow
        the list), the class table and scores are printed, and the list of
        scores is returned.
        """
        scores = []
        for t in self.mx.vocabulary():
            p_term = self.pr_term(t)
            score = 0.0
            for c in self.classes:
                p_joint = self.pr_joint(t, c)
                denominator = p_term * self.pr_class(c)
                # Skip undefined terms explicitly instead of catching every
                # exception: a blanket handler would also hide real errors.
                if p_joint <= 0 or denominator <= 0:
                    continue
                score += p_joint * math.log10(p_joint / denominator)
            scores.append(score)
        self.mi_terms = scores
        # Single-argument print calls behave the same on Python 2 and 3.
        print(self.classes)
        print(self.mi_terms)
        return self.mi_terms