예제 #1
0
파일: matrix.py 프로젝트: KemingChen/IR
 def tf_idf(self, do_idf=True):
     ''' Convert the term weights of every document to tf.idf values, in place.

         Each cell doc['terms'][idx] is replaced by self._log_tf(cell),
         multiplied by the term's idf = log10(N / df) when do_idf is True,
         where N is the number of documents and df is the number of
         documents in which the term has a positive count.

         do_idf: if False, convert to tf only (no idf is computed at all)

         A term that occurs in no document (df == 0) gets an idf of 0.0
         instead of raising ZeroDivisionError.
     '''
     n_docs = len(self.docs)
     n_terms = len(self.terms)

     idf = None
     if do_idf:
         # Document frequency: in how many documents does each term occur?
         df = [0] * n_terms
         for doc in self.docs:
             for idx in range(n_terms):
                 if doc['terms'][idx] > 0:
                     df[idx] += 1
         # idf depends only on the term, so compute it once per term rather
         # than once per (document, term) cell. Guard df == 0 explicitly.
         idf = [math.log10(float(n_docs) / count) if count else 0.0
                for count in df]

     for doc in self.docs:
         weights = doc['terms']
         for idx in range(n_terms):
             tf = self._log_tf(weights[idx])
             weights[idx] = tf * idf[idx] if do_idf else tf
예제 #2
0
 def tf_idf(self, do_idf=True):
     ''' Convert the term weights of every document to tf.idf values, in place.

         Each cell doc['terms'][idx] is replaced by self._log_tf(cell),
         multiplied by the term's idf = log10(N / df) when do_idf is True,
         where N is the number of documents and df is the number of
         documents in which the term has a positive count.

         do_idf: if False, convert to tf only (no idf is computed at all)

         A term that occurs in no document (df == 0) gets an idf of 0.0
         instead of raising ZeroDivisionError.
     '''
     n_docs = len(self.docs)
     n_terms = len(self.terms)

     idf = None
     if do_idf:
         # Document frequency: in how many documents does each term occur?
         df = [0] * n_terms
         for doc in self.docs:
             for idx in range(n_terms):
                 if doc['terms'][idx] > 0:
                     df[idx] += 1
         # idf depends only on the term, so compute it once per term rather
         # than once per (document, term) cell. Guard df == 0 explicitly.
         idf = [math.log10(float(n_docs) / count) if count else 0.0
                for count in df]

     for doc in self.docs:
         weights = doc['terms']
         for idx in range(n_terms):
             tf = self._log_tf(weights[idx])
             weights[idx] = tf * idf[idx] if do_idf else tf
예제 #3
0
class Stats:
    ''' Term/class occurrence statistics computed from a matrix.

        The matrix is expected to expose:
          * classes    - mapping of class label -> per-term counts
          * terms      - the vocabulary, as returned by get_terms_freq()
          * mx[t]      - index of term t, or -1 when t is unknown
          * vocabulary() - iterable of terms, used by mi()
    '''

    def __init__(self, matrix):
        ''' Aggregate per-class and overall term counts from matrix. '''
        self.mx = matrix
        # Total number of term occurrences over all classes.
        self.N  = 0
        # class label -> {'terms': per-term counts, 'total': their sum}
        self.classes = {}
        # Per-term counts accumulated over all classes via SuperList.add
        # (presumably element-wise addition; confirm against SuperList).
        self.terms = SuperList()
        for c in self.mx.classes:
            class_terms = self.mx.classes[c]
            class_total = sum(class_terms)
            self.classes[c] = {'terms': class_terms, 'total': class_total}
            self.terms.add(class_terms)
            self.N += class_total
        # Filled by mi(): one score per vocabulary term.
        self.mi_terms = []

    def __str__(self):
        s  = 'Matrix Stats:'
        s += '\n * Vocabulary/Terms: %d/%d' % (len(self.terms), self.N)
        return s

    def getN(self):
        ''' Get total number of terms, counting their frequencies too.
            Notice: This is not the same as len(vocabulary)
        '''
        return self.N

    def get_terms_freq(self, normalized=False):
        ''' Returns 2d matrix of vocabulary terms and their occurrences
            if normalized is True, divide by total number of terms
        '''
        terms = self.mx.terms
        freq = self.terms.div(self.N) if normalized else self.terms
        return [terms, freq]

    def pr_term(self, t):
        ''' Get probability of term t.
            Returns 0.0 for an unknown term or when no terms were counted.
        '''
        i = self.mx[t]
        if i == -1 or not self.N:
            return 0.0
        return float(self.terms[i]) / self.N

    def pr_class(self, c):
        ''' Get probability of class c.
            Returns 0.0 when no terms were counted (N == 0).
            Raises KeyError for an unknown class.
        '''
        total = self.classes[c]['total']
        if not self.N:
            return 0.0
        return float(total) / self.N

    def pr_joint(self, t, c):
        ''' Get joint probability between term t and class c.
            Returns 0.0 for an unknown term or when no terms were counted.
        '''
        i = self.mx[t]
        if i == -1 or not self.N:
            return 0.0
        return float(self.classes[c]['terms'][i]) / self.N

    def mi(self):
        ''' Compute a mutual-information score for every vocabulary term.

            For each term t the score is the sum over classes c of
                P(t,c) * log10( P(t,c) / (P(t) * P(c)) )
            A (t, c) pair with zero probability contributes nothing
            (0 * log 0 is taken as 0); it no longer wipes out what the
            other classes already contributed.

            The scores are stored in self.mi_terms, in vocabulary order,
            replacing the result of any previous call. self.classes and
            self.mi_terms are printed afterwards.
        '''
        scores = []
        for t in self.mx.vocabulary():
            p_term = self.pr_term(t)
            score = 0
            for c in self.classes:
                p_joint = self.pr_joint(t, c)
                p_indep = p_term * self.pr_class(c)
                # log10 is undefined for a non-positive ratio; skip the pair.
                if p_joint <= 0 or p_indep <= 0:
                    continue
                score += p_joint * math.log10(p_joint / p_indep)
            scores.append(score)
        self.mi_terms = scores
        # Single-argument print(...) prints the same on Python 2 and 3.
        print(self.classes)
        print(self.mi_terms)