Example no. 1
0
class Test(unittest.TestCase):
    """Unit tests for the DataSource class."""

    def setUp(self):
        """Create a fresh DataSource backed by the shared test data file."""
        self.datafile = '../data'
        self.ds = DataSource(self.datafile)

    def testInitialization(self):
        """DataSource infers the target attribute when none is given."""
        ds = DataSource('../data')
        self.assertEqual(ds.targetAttr, 'Purchase?')
        self.assertEqual(len(ds.dataset), 20)
        self.assertEqual(ds.datafile, self.datafile)

    def testInitialization2(self):
        """DataSource accepts an explicit target attribute."""
        ds = DataSource('../data', 'Purchase?')
        self.assertEqual(ds.targetAttr, 'Purchase?')
        self.assertEqual(len(ds.dataset), 20)
        self.assertEqual(ds.datafile, self.datafile)

    def testMajorityValue(self):
        """majorityValue returns the most frequent target value."""
        # Fixed typo in the method name (was testMojorityValue); unittest
        # still discovers it through the test* prefix, so no runner changes.
        self.assertEqual(
            self.ds.majorityValue(self.ds.dataset, 'Purchase?'), 'will buy')

    def testDualization(self):
        """Smoke test: dualization runs without raising."""
        # No assertion: the method's observable effect is the output file
        # '../data.o', which is not inspected here.
        self.ds.dualization('../data.o')

    def tearDown(self):
        del self.ds
Example no. 2
0
File: dtree.py — Project: yoyzhou/dmP
 def __init__(self, datafile, splitmetric):
     """
     Set up the decision-tree context.

     @param datafile: path to the training data file
     @param splitmetric: fitness function used to pick the best splitter
     """
     # No tree exists until createDecisionTree is called.
     self.decisiontree = None
     self.splitmetric = splitmetric
     self.datasource = DataSource(datafile)
Example no. 3
0
File: dtree.py — Project: yoyzhou/dmP
class DecisionTree(object):
    """
    An ID3-style decision tree built from a DataSource training set.
    """

    def __init__(self, datafile, splitmetric):
        """
        Constructor:
        @param datafile: training set, path to training data file
        @param splitmetric: fitness metric/function to choose best splitter
        """
        self.datasource = DataSource(datafile)
        self.splitmetric = splitmetric
        self.decisiontree = None

    def createDecisionTree(self):
        """
        Create a decision tree against the training set.
        @return: self, so calls can be chained
        """
        self.decisiontree = self.__treeGrowth__(self.datasource.dataset,
                                                self.datasource.attributes,
                                                self.datasource.targetAttr)
        return self

    def makeDecision(self, sample=None, testset=None, testfile=None):
        """
        Make a decision for a single sample, an in-memory test set, or a
        test file. Exactly one source is used, with priority
        testfile > testset > sample.
        @param sample: one attribute->value record
        @param testset: list of records
        @param testfile: path to a test data file
        @raise ValueError: when no test data is supplied
        """
        testsamples = []
        if testfile:
            ds = DataSource(testfile)
            testsamples.extend(ds.dataset)
        elif testset:
            testsamples.extend(testset)
        elif sample:
            testsamples.append(sample)
        else:
            raise ValueError('No test set passed in.')

        for test in testsamples:
            # Walk from the root, following the branch that matches the
            # sample's value for the current node's attribute, until a
            # leaf is reached; the leaf's attribute is the decision.
            dtree = self.decisiontree
            while True:
                attr = dtree.data.attribute
                node = dtree.getNode(test[attr], False)
                if node.isBranch():
                    test['decision?'] = node.data.attribute
                    break
                else:
                    dtree = node

            print(test)

    def prettyTree(self):
        """
        Print the decision tree.
        @raise AttributeError: if the tree has not been created yet
        """
        # Fixed the malformed four-quote docstring opener and the broken
        # grammar of the user-facing error message.
        try:
            self.decisiontree.prettyTree()
        except AttributeError:
            raise AttributeError("You haven't created a decision tree yet; "
                                 "please call createDecisionTree first.")

    def __treeGrowth__(self, dataset, attributes, target):
        """
        Recursively grows the decision tree based on the training set.
        @param dataset: training records for this subtree
        @param attributes: attribute set, which may contain the target attribute
        @param target: target attribute
        """
        # Target values of every record in this partition.
        tvals = [record[target] for record in dataset]

        # NOTE(review): majorityValue is evaluated even when dataset is
        # empty; assumed to cope with [] — confirm in DataSource.
        default = self.datasource.majorityValue(dataset)

        # If the data set is empty or no non-target attribute remains,
        # return the default (majority) value as a leaf.
        if not dataset or (len(attributes) - 1) <= 0:
            return Tree(DecisionNode(default))

        # If every record has the same classification, return that leaf.
        elif tvals.count(tvals[0]) == len(tvals):
            return Tree(DecisionNode(tvals[0]))
        else:
            # Choose the attribute that best classifies the data.
            best = self.splitmetric(dataset, attributes, target)

            # New decision node for the best attribute.
            dtree = Tree(DecisionNode(best))

            # Attributes for the next level: all except the fitted `best`.
            attrs = [attr for attr in attributes if attr != best]

            # One subtree per observed value of the best attribute.
            for val in self.datasource.uniqueValues(dataset, best):
                subtree = self.__treeGrowth__(
                    self.datasource.subDataSet(dataset, best, val),
                    attrs,
                    target)

                # Label the edge to the subtree with the splitting value.
                subtree.data.condition = val
                dtree.addChild(subtree)

            return dtree
Example no. 4
0
class NaiveBayes(object):
    '''
    Naive Bayes classifier trained from a DataSource file.
    '''

    def __init__(self, datafile):
        '''
        Constructor:
        @param  datafile: path to the training data file
        '''
        self.datasource = DataSource(datafile)

    def classify(self, contAttrs=None, **kwargs):
        """
        Classify a test sample/set using the NB decision rule.
        @param contAttrs: names of the continuous attributes in the test data
        @param kwargs: attribute=value pairs for one sample, or testset=[...]
                       for a whole set (testset takes precedence)
        """
        # Avoid the original shared mutable default argument ([]).
        contAttrs = contAttrs if contAttrs is not None else []

        clazzes = self.datasource.getClasses()

        if "testset" in kwargs:
            testset = kwargs["testset"]
        else:
            testset = [kwargs]

        for record in testset:
            clazzProb = []
            for clss in clazzes:
                # Prior P(class), with the same smoothing as probD.
                prob = self.probD(self.datasource.dataset,
                                  {self.datasource.targetAttr: clss})

                for attr, value in record.items():
                    if attr == self.datasource.targetAttr:
                        continue

                    # Multiply in P(attr=value | class); continuous
                    # attributes use a Gaussian estimate, discrete ones
                    # use smoothed counts.
                    prob *= self.conditionalEstimation(
                        clss, {attr: value},
                        continual=(attr in contAttrs))

                clazzProb.append([clss, prob])

            # MAP decision: class with the maximum posterior (first one
            # on ties, matching the original selection).
            classlabel = max(clazzProb, key=lambda cp: cp[1])[0]

            record['?'] = classlabel

    def probD(self, dataset, attrvalDict=None, **kwargs):
        """
        Probability that a record satisfies a conjunction of DISCRETE
        attribute conditions, with additive smoothing.
        @param dataset: list of attribute->value records
        @param attrvalDict: conditions as a dict; if falsy, kwargs are used
        @return: (matches + 1) / (len(dataset) + 3)
        """
        if not attrvalDict:
            attrvalDict = kwargs

        totalrecords = len(dataset)   # Total number of records
        matchrecords = 0              # Records matching all conditions

        for record in dataset:
            # BUG FIX: the per-record match counter must restart for every
            # record; the original accumulated it across the whole loop, so
            # at most one record could ever be counted as a match.
            attrmatch = sum(1 for k, v in attrvalDict.items()
                            if record[str(k)] == v)

            # The record matches when every condition holds.
            if attrmatch == len(attrvalDict):
                matchrecords += 1

        # Smoothed estimate; the original '3 * 1.0 / 3' pseudo-count is 1.
        return (matchrecords + 1.0) / (totalrecords + 3)

    def probC(self, dataset, attrvalDict=None, **kwargs):
        """
        Gaussian density estimate for a CONTINUOUS attribute value.
        Only one attribute-value pair is expected; if more are given,
        only the first is used (not guaranteed which, as before).
        """
        if not attrvalDict:
            attrvalDict = kwargs

        # Read the pair without mutating the caller's dict (the original
        # popitem() removed the entry as a side effect).
        attrname, value = next(iter(attrvalDict.items()))

        v_seq = [int(v[attrname]) for v in dataset]

        n = len(v_seq)
        seq_mean = sum(v_seq) * 1.0 / n
        # Population standard deviation (divide by n, as the original did).
        seq_std = math.sqrt(sum((x - seq_mean) ** 2 for x in v_seq) / n)

        # Normal pdf evaluated at the test value.
        estimate = ((1.0 / (math.sqrt(2 * math.pi) * seq_std)) *
                    math.exp(-((int(value) - seq_mean) ** 2) /
                             (2.0 * seq_std ** 2)))

        return estimate

    def conditionalEstimation(self, condition, attrvalDict=None,
                              continual=False, **kwargs):
        """
        Estimates P(attribute(s) | class == condition).
        @param condition: the class label to condition on
        @param attrvalDict: attribute-value pair(s) being estimated
        @param continual: True for a continuous attribute (default False)
        @param kwargs: alternative way to pass the attribute-value pair(s)
        @return: conditional probability for the attribute(s)
        """
        # Restrict the training data to records of the given class.
        dataset = self.datasource.subDataSet(self.datasource.dataset,
                                             self.datasource.targetAttr,
                                             condition)
        if continual:
            return self.probC(dataset, attrvalDict, **kwargs)
        else:
            return self.probD(dataset, attrvalDict, **kwargs)

    def evidence(self, attrvalDict=None, **kwargs):
        """
        Evidence (marginal probability) of the attribute condition(s).
        Note: with continuous attributes this is only a reasonable
        measurement on large training sets.
        """
        return self.probD(self.datasource.dataset, attrvalDict, **kwargs)
Example no. 5
0
 def __init__(self, datafile):
     """
     Initialize the classifier with its training data source.

     @param datafile: path to the training data file
     """
     self.datasource = DataSource(datafile)
Example no. 6
0
 def setUp(self):
     """Build a DataSource over the shared test data file."""
     path = '../data'
     self.datafile = path
     self.ds = DataSource(path)