# Example no. 1
# 0
class NaiveBayes(object):
    '''
    Naive Bayes Classifier.

    Training records are read through ``self.datasource``; this class uses its
    ``dataset``, ``targetAttr``, ``getClasses()`` and ``subDataSet()`` members.
    Records are indexed like dicts (``record[attribute_name]``).
    '''

    def __init__(self, datafile):
        '''
        Constructor:
        @param  datafile: training set, passed unchanged to DataSource
        '''
        self.datasource = DataSource(datafile)

    def classify(self, contAttrs=None, **kwargs):
        """
        Classify test sample/set using NB classifier.

        For every record the score of each class is the smoothed class prior
        multiplied by the conditional estimate of each attribute value; the
        class with the highest score wins (the first one on ties). The
        predicted label is stored in the record under the '?' key.

        @param contAttrs: names of the continual attributes in test sample/set
        @param kwargs: attribute-value pairs of one sample, or testset=<list of
                       dict records>; once testset=... is given, only the test
                       set is used
        @return: list of predicted class labels, one per record, in order
        @raise ValueError: if the data source reports no classes
        """
        if contAttrs is None:
            contAttrs = []

        clazzes = self.datasource.getClasses()
        targetAttr = self.datasource.targetAttr

        if "testset" in kwargs:
            testset = kwargs["testset"]
        else:
            # A single sample given as keyword arguments.
            testset = [kwargs]

        labels = []
        for record in testset:
            clazzProb = []
            for clss in clazzes:
                # Class prior estimated over the whole training set.
                prob = self.probD(self.datasource.dataset, {targetAttr: clss})

                for attr, value in record.items():
                    # Skip the target itself and a label left by an earlier
                    # classify() call; neither is a predictive attribute.
                    if attr == targetAttr or attr == '?':
                        continue
                    prob *= self.conditionalEstimation(
                        clss, {attr: value}, continual=(attr in contAttrs))

                clazzProb.append([clss, prob])

            # max() returns the first maximal entry, i.e. the first class on ties.
            classlabel = max(clazzProb, key=lambda pair: pair[1])[0]
            record['?'] = classlabel
            labels.append(classlabel)

        return labels

    def probD(self, dataset, attrvalDict=None, **kwargs):
        """
        Calculate probability for DISCRETE attributes.

        Counts the records of dataset in which every given attribute equals
        the given value and returns the smoothed ratio
        (matches + 1) / (total + 3).

        @param dataset: sized iterable of records indexed by attribute name
        @param attrvalDict: attribute-value conditions; when empty or None the
                            keyword arguments are used instead
        @param kwargs: attribute-value conditions given as keywords
        @return: smoothed probability as a float
        """
        if not attrvalDict:
            attrvalDict = kwargs

        totalrecords = len(dataset)          # Total number of records
        matchrecords = 0                     # Records matching ALL conditions

        for record in dataset:
            # Evaluated per record: a record matches only when every
            # condition holds for that record itself.
            if all(record[str(k)] == v for k, v in attrvalDict.items()):
                matchrecords += 1

        # Additive smoothing so unseen values never yield a zero probability.
        # NOTE(review): the denominator constant 3 is hard-coded; presumably
        # the assumed number of distinct values - confirm.
        return (matchrecords + 1.0) / (totalrecords + 3)

    def probC(self, dataset, attrvalDict=None, **kwargs):
        """
        Calculate probability for CONTINUAL attributes.

        Evaluates the normal density, with the mean and population standard
        deviation of the attribute over dataset, at the given value.
        Only one attribute-value pair is used; if several are given, the one
        dict.popitem() would return is taken. The caller's dict is not
        modified.

        @param dataset: iterable of records indexed by attribute name
        @param attrvalDict: single attribute-value pair; when empty or None
                            the keyword arguments are used instead
        @param kwargs: attribute-value pair given as keyword
        @return: density estimate as a float
        @raise KeyError: if no attribute-value pair is given
        @raise ZeroDivisionError: if dataset is empty or the attribute has
                                  zero variance in dataset
        """
        if not attrvalDict:
            attrvalDict = kwargs

        # Pop from a copy so the caller's dict is left untouched.
        attrname, value = dict(attrvalDict).popitem()

        # NOTE(review): values are converted with int(), so fractional parts
        # of numeric inputs are truncated - confirm this is intended.
        v_seq = [int(v[attrname]) for v in dataset]

        n = len(v_seq)
        seq_mean = sum(v_seq) * 1.0 / n
        seq_std = math.sqrt(sum((x - seq_mean) ** 2 for x in v_seq) / n)

        coefficient = 1.0 / (math.sqrt(2 * math.pi) * seq_std)
        exponent = -((int(value) - seq_mean) ** 2) / (2.0 * seq_std ** 2)
        return coefficient * math.exp(exponent)

    def conditionalEstimation(self, condition, attrvalDict=None, continual=False, **kwargs):
        """
        Estimates probability of attribute(s) under condition of certain class.

        The training set is restricted through datasource.subDataSet() using
        the target attribute and the given class label, then the estimate is
        delegated to probC (continual) or probD (discrete).

        @param condition: certain class label
        @param attrvalDict: attribute-value pair(s) being estimated
        @param continual: True for continual attribute, the default value is False
        @param **kwargs: key-value pair of attribute(s) being estimated
        @return: conditional probability for attribute(s)
        """
        dataset = self.datasource.subDataSet(
            self.datasource.dataset, self.datasource.targetAttr, condition)
        if continual:
            return self.probC(dataset, attrvalDict, **kwargs)
        return self.probD(dataset, attrvalDict, **kwargs)

    def evidence(self, attrvalDict=None, **kwargs):
        """
        Calculate evidence of attribute(s) condition over the whole training set.

        Note: if the conditions contain a continual attribute, evidence is not
        an appropriate measurement when the training sample is small; for a
        large training set, calculating evidence for a continual attribute is
        reasonable.

        @param attrvalDict: attribute-value conditions
        @param kwargs: attribute-value conditions given as keywords
        @return: smoothed probability as computed by probD
        """
        return self.probD(self.datasource.dataset, attrvalDict, **kwargs)