Пример #1
0
 def prune(self):
     """Prune the subtree rooted at this node, children first.

     After all sons have been pruned, three error figures are compared with
     a slack of 0.1: the estimate for the whole subtree, the estimate for
     this node turned into a leaf, and (only when ``m_subtreeRaising`` is
     set) the estimate for the largest branch applied to ``m_train``.
     The node is either replaced by a leaf, replaced by its largest branch
     (followed by another pruning round), or left unchanged.
     """
     if self.m_isLeaf:
         return

     def notLess(lhs, rhs):
         # "lhs >= rhs" expressed through the Utils comparison helpers.
         return Utils.gr(lhs, rhs) or Utils.equal(lhs, rhs)

     # Bottom-up: the estimates below are taken on already-pruned children.
     for childIdx in range(len(self.m_sons)):
         self.son(childIdx).prune()

     biggestIdx = self.localModel().distribution().maxBag()
     if self.m_subtreeRaising:
         raisedErrors = self.son(biggestIdx).getEstimatedErrorsForBranch(
             self.m_train)
     else:
         # Raising disabled: make the branch alternative never win.
         raisedErrors = float("inf")
     leafErrors = self.getEstimatedErrorsForDistribution(
         self.localModel().distribution())
     subtreeErrors = self.getEstimatedErrors()

     # Collapse to a leaf when the leaf is no worse than both alternatives.
     if notLess(subtreeErrors + 0.1, leafErrors) and \
             notLess(raisedErrors + 0.1, leafErrors):
         self.m_sons = None
         self.m_isLeaf = True
         self.m_localModel = NoSplit(self.localModel().distribution())
         return

     # Otherwise lift the largest branch into this node when it is no worse
     # than the full subtree, then prune the raised structure again.
     if notLess(subtreeErrors + 0.1, raisedErrors):
         raised = self.son(biggestIdx)
         self.m_sons = raised.m_sons
         self.m_localModel = raised.localModel()
         self.m_isLeaf = raised.m_isLeaf
         self.newDistribution(self.m_train)
         self.prune()
Пример #2
0
 def split(self,data:Instances)->List[Instances]:
     """Partition ``data`` into ``m_numSubsets`` instance sets.

     An instance for which ``whichSubset`` returns an index above -1 goes
     into that subset only.  Otherwise it is added to every subset whose
     entry in ``self.weights(inst)`` is greater than zero, and the weight of
     the added copy is set to that entry times the instance weight.

     Returns the list of subsets, one per subset index.
     """
     # Pass 1: count how many instances each subset will receive.
     capacity = [0 for _ in range(self.m_numSubsets)]
     for row in data:
         target = self.whichSubset(row)
         if target > -1:
             capacity[target] += 1
             continue
         fractions = self.weights(row)
         for bag in range(self.m_numSubsets):
             if Utils.gr(fractions[bag], 0):
                 capacity[bag] += 1

     # One container per subset, created with the counted capacity.
     subsets = [Instances(data, capacity[bag])
                for bag in range(self.m_numSubsets)]  # type: List[Instances]

     # Pass 2: distribute the instances.
     for row in data:
         target = self.whichSubset(row)
         if target > -1:
             subsets[target].add(row)
             continue
         fractions = self.weights(row)
         for bag in range(self.m_numSubsets):
             if Utils.gr(fractions[bag], 0):
                 subsets[bag].add(row)
                 # Re-weight the entry that was just appended.
                 subsets[bag].lastInstance().setWeight(
                     float(fractions[bag] * row.weight()))
     return subsets
Пример #3
0
 def computeAverageClassValues(self):
     """Compute the weighted average class value per nominal attribute value.

     For every nominal attribute of the input format, instances with a
     non-missing class and a non-missing attribute value contribute
     ``weight * classValue`` to the running sum of their attribute value and
     ``weight`` to its count.  Each sum is then divided by its count; a
     value that never occurred receives the attribute-wide average instead.
     The averages are passed to ``Utils.sortDouble`` and the result is
     stored in ``self.m_Indices[j]`` (presumably the index ordering of the
     averages -- confirm against ``Utils.sortDouble``).  Entries for
     non-nominal attributes stay empty lists.
     """
     data = self.getInputFormat()
     numAttributes = data.numAttributes()
     self.m_Indices = [[] for _ in range(numAttributes)]
     for j in range(numAttributes):
         att = data.attribute(j)
         if not att.isNominal():
             continue
         numValues = att.numValues()
         classSums = [0] * numValues
         counts = [0] * numValues
         for i in range(data.numInstances()):
             instance = data.instance(i)
             if instance.classIsMissing() or instance.isMissing(j):
                 continue
             valueIndex = int(instance.value(j))
             weight = instance.weight()
             counts[valueIndex] += weight
             # Accumulate the weighted class value; multiplying the weight
             # by itself would leave the class out of the "average".
             classSums[valueIndex] += weight * instance.classValue()
         grandSum = sum(classSums)
         totalCounts = sum(counts)
         if Utils.gr(totalCounts, 0):
             for k in range(numValues):
                 if Utils.gr(counts[k], 0):
                     classSums[k] /= counts[k]
                 else:
                     # Unseen attribute value: fall back to the overall mean.
                     classSums[k] = grandSum / totalCounts
         self.m_Indices[j] = Utils.sortDouble(classSums)
Пример #4
0
 def selectModel(self, data: Instances, test: Instances = None):
     """Choose the split model for ``data``.

     When ``test`` is supplied it is ignored and the call is forwarded to
     the one-argument form.  A ``NoSplit`` model is returned when there is
     too little weight to split, when all weight belongs to one class, when
     no attribute yields a valid ``C45Split``, or when the best gain ratio
     equals zero.  Otherwise the valid split with the highest gain ratio
     among those whose info gain is at least the average (minus 1e-3) is
     returned.
     """
     if test is not None:
         return self.selectModel(data)

     multiVal = True
     gainSum = usable = 0
     classDist = Distribution(data)
     fallback = NoSplit(classDist)
     tooFew = Utils.gr(2 * self.m_minNoObj, classDist.total())
     if tooFew or Utils.equal(classDist.total(),
                              classDist.perClass(classDist.maxClass())):
         return fallback

     # multiVal stays True only if no attribute is numeric and none has
     # fewer values than 30% of the instances in m_allData.
     if self.m_allData is not None:
         for attr in data.enumerateAttributes():
             if attr.isNumeric() or Utils.gr(
                     0.3 * self.m_allData.numInstances(), attr.numValues()):
                 multiVal = False
                 break

     candidates = [None] * data.numAttributes()  # type: List[C45Split]
     weightTotal = data.sumOfWeight()
     for idx in range(data.numAttributes()):
         if idx == data.classIndex():
             candidates[idx] = None
             continue
         model = C45Split(idx, self.m_minNoObj, weightTotal,
                          self.m_useMDLcorrection)
         candidates[idx] = model
         model.buildClassifer(data)
         if not model.checkModel():
             continue
         if self.m_allData is not None:
             eligible = data.attribute(idx).isNumeric() or \
                 (multiVal or Utils.gr(0.3 * self.m_allData.numInstances(),
                                       data.attribute(idx).numValues()))
             if not eligible:
                 continue
         gainSum = gainSum + model.infoGain()
         usable += 1

     if usable == 0:
         return fallback
     meanGain = gainSum / usable

     bestRatio = 0
     for idx in range(data.numAttributes()):
         if idx == data.classIndex():
             continue
         model = candidates[idx]
         if not model.checkModel():
             continue
         # The 1e-3 slack lets splits marginally below the average compete.
         if model.infoGain() >= meanGain - 1e-3 and \
                 Utils.gr(model.gainRatio(), bestRatio):
             bestModel = model
             bestRatio = model.gainRatio()
     if Utils.equal(bestRatio, 0):
         return fallback

     # Fold the instances with an unknown value for the chosen attribute
     # into the winning model's distribution.
     bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
     if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
         bestModel.setSplitPoint(self.m_allData)
     return bestModel
Пример #5
0
 def splitCritValue(self,
                    bags: Distribution,
                    totalNoInst: float = None,
                    numerator: float = None):
     """Evaluate the splitting criterion in one of three call forms.

     * ``numerator`` given: ``numerator`` divided by
       ``splitEnt(bags, totalNoInst) / totalNoInst``; 0 when that split
       entropy equals zero.
     * neither optional argument given: ``splitEnt(bags)`` divided by
       ``oldEnt(bags) - newEnt(bags)``; infinity when either term equals
       zero.
     * only ``totalNoInst`` given: a sum of ``lnFunc`` terms over the bags,
       the weight not covered by ``bags`` and ``totalNoInst``, divided by
       ``log(2)``.
     """
     if numerator is not None:
         splitInfo = self.splitEnt(bags, totalNoInst)
         if Utils.equal(splitInfo, 0):
             return 0
         splitInfo /= totalNoInst
         return numerator / splitInfo

     if totalNoInst is None:
         gain = self.oldEnt(bags) - self.newEnt(bags)
         if Utils.equal(gain, 0):
             return float('inf')
         splitInfo = self.splitEnt(bags)
         if Utils.equal(splitInfo, 0):
             return float('inf')
         # Reciprocal form: split entropy over gain.
         return splitInfo / gain

     acc = 0
     # Weight in totalNoInst that is not present in ``bags``.
     uncovered = totalNoInst - bags.total()
     if Utils.gr(bags.total(), 0):
         for bag in range(bags.numBags()):
             acc = acc - self.lnFunc(bags.perBag(bag))
         acc = acc - self.lnFunc(uncovered)
         acc = acc + self.lnFunc(totalNoInst)
     return acc / math.log(2)
Пример #6
0
 def buildClassifier(self, instances: Instances):
     """Fit the classifier to ``instances`` from class values alone.

     For a numeric class ``m_ClassValue`` becomes the weighted mean of the
     non-missing class values (left at 0 when the total weight is not
     positive) and ``m_Counts`` is ``None``.  For a nominal class
     ``m_Counts`` starts at 1 per class, accumulates instance weights,
     ``m_ClassValue`` becomes ``Utils.maxIndex(m_Counts)`` and the counts
     are passed to ``Utils.normalize`` with the total weight.

     Raises whatever ``getCapabilities().testWithFail`` raises for
     unsupported data.
     """
     self.getCapabilities().testWithFail(instances)
     sumOfWeights = 0
     self.m_Class = instances.classAttribute()
     self.m_ClassValue = 0
     attrType = instances.classAttribute().type()
     if attrType == Attribute.NUMERIC:
         self.m_Counts = None
     elif attrType == Attribute.NOMINAL:
         # Every class starts with a count of one, and the running total
         # starts at the number of classes to match.
         self.m_Counts = [1] * instances.numClasses()
         sumOfWeights = instances.numClasses()
     for instance in instances:
         classValue = instance.classValue()
         if Utils.isMissingValue(classValue):
             continue
         weight = instance.weight()
         if instances.classAttribute().isNominal():
             # Cast before indexing: a float class value cannot be used as
             # a list index directly.
             self.m_Counts[int(classValue)] += weight
         else:
             self.m_ClassValue += weight * classValue
         sumOfWeights += weight
     if instances.classAttribute().isNumeric():
         if Utils.gr(sumOfWeights, 0):
             self.m_ClassValue /= sumOfWeights
     else:
         self.m_ClassValue = Utils.maxIndex(self.m_Counts)
         Utils.normalize(self.m_Counts, sumOfWeights)
Пример #7
0
 def maxClass(self, index: int = None):
     """Return the index of the class with the greatest weight.

     Without ``index`` the totals in ``m_perClass`` are scanned.  With
     ``index`` the per-class weights of that bag are scanned, unless the
     bag's total weight is not greater than zero, in which case the result
     for the whole distribution is returned.  Ties keep the first maximum;
     0 is returned when no weight exceeds zero.
     """
     if index is not None and not Utils.gr(self.m_perBag[index], 0):
         # Empty bag: defer to the distribution-wide answer.
         return self.maxClass()

     bestCount = 0
     bestClass = 0
     for cls in range(len(self.m_perClass)):
         if index is None:
             weight = self.m_perClass[cls]
         else:
             weight = self.m_perClassPerBag[index][cls]
         if Utils.gr(weight, bestCount):
             bestCount = weight
             bestClass = cls
     return bestClass
Пример #8
0
 def dumpLabel(self,index:int,data:Instances):
     """Build the text label for bag ``index``.

     The label is the name of the bag's majority class followed by the bag
     weight in parentheses, rounded to two places; when the bag contains
     incorrectly classified weight, that amount is appended after a slash.
     """
     pieces = [data.classAttribute().value(self.m_distribution.maxClass(index))]
     pieces.append(" (" + str(Utils.roundDouble(self.m_distribution.perBag(index), 2)))
     if Utils.gr(self.m_distribution.numIncorrect(index), 0):
         pieces.append("/" + str(Utils.roundDouble(self.m_distribution.numIncorrect(index), 2)))
     pieces.append(")")
     return "".join(pieces)
Пример #9
0
 def handleNumericAttribute(self, trainInstances: Instances):
     """Search for the best binary split point on a numeric attribute.

     Instances are consumed up to the first one whose value for
     ``m_attIndex`` is missing (assumes ``trainInstances`` is sorted on
     that attribute with missing values last -- confirm against caller).
     Every position where two neighbouring values differ by more than 1e-5
     is a candidate; a candidate counts only when both sides hold at least
     ``minSplit`` weight.  On success ``m_numSubsets``, ``m_splitPoint``,
     ``m_distribution``, ``m_infoGain`` and ``m_gainRatio`` describe the
     best split; otherwise the method returns early and leaves
     ``m_numSubsets`` untouched.
     """
     def atLeast(lhs, rhs):
         # "lhs >= rhs" expressed through the Utils comparison helpers.
         return Utils.gr(lhs, rhs) or Utils.equal(lhs, rhs)

     splitIndex = -1
     self.m_distribution = Distribution(2, trainInstances.numClasses())

     # Only instances with a known value take part; all start in bag 1.
     firstMiss = 0
     for inst in trainInstances:
         if inst.isMissing(self.m_attIndex):
             break
         self.m_distribution.add(1, inst)
         firstMiss += 1

     # Minimum weight per side: 10% of the average class weight, but not
     # below m_minNoObj and capped at 25.
     minSplit = 0.1 * self.m_distribution.total(
     ) / trainInstances.numClasses()
     if Utils.gr(self.m_minNoObj, minSplit) or Utils.equal(
             minSplit, self.m_minNoObj):
         minSplit = self.m_minNoObj
     elif Utils.gr(minSplit, 25):
         minSplit = 25

     # Not enough known values to fill both sides.
     if Utils.gr(2 * minSplit, firstMiss):
         return

     defaultEnt = self.infoGainCrit.oldEnt(self.m_distribution)
     lastShifted = 0
     for upper in range(1, firstMiss):
         lowerValue = trainInstances.instance(upper - 1).value(
             self.m_attIndex)
         upperValue = trainInstances.instance(upper).value(self.m_attIndex)
         if lowerValue + 1e-5 < upperValue:
             # Move everything up to this boundary from bag 1 into bag 0.
             self.m_distribution.shiftRange(1, 0, trainInstances,
                                            lastShifted, upper)
             if atLeast(self.m_distribution.perBag(0), minSplit) and \
                     atLeast(self.m_distribution.perBag(1), minSplit):
                 currentInfoGain = self.infoGainCrit.splitCritValue(
                     self.m_distribution, self.m_sumOfWeights, defaultEnt)
                 if Utils.gr(currentInfoGain, self.m_infoGain):
                     self.m_infoGain = currentInfoGain
                     splitIndex = upper - 1
                 # m_index counts the admissible candidate splits.
                 self.m_index += 1
             lastShifted = upper

     if self.m_index == 0:
         return
     if self.m_useMDLcorrection:
         self.m_infoGain = self.m_infoGain - (Utils.log2(self.m_index) /
                                              self.m_sumOfWeights)
     if Utils.gr(0, self.m_infoGain) or Utils.equal(0, self.m_infoGain):
         return

     self.m_numSubsets = 2
     belowValue = trainInstances.instance(splitIndex).value(self.m_attIndex)
     aboveValue = trainInstances.instance(splitIndex + 1).value(
         self.m_attIndex)
     self.m_splitPoint = (aboveValue + belowValue) / 2
     # If rounding put the midpoint on the upper value, use the lower one
     # so the two sides stay separated.
     if self.m_splitPoint == aboveValue:
         self.m_splitPoint = belowValue

     # Rebuild the distribution for the chosen split.
     self.m_distribution = Distribution(2, trainInstances.numClasses())
     self.m_distribution.addRange(0, trainInstances, 0, splitIndex + 1)
     self.m_distribution.addRange(1, trainInstances, splitIndex + 1,
                                  firstMiss)
     self.m_gainRatio = self.gainRatioCrit.splitCritValue(
         self.m_distribution, self.m_sumOfWeights, self.m_infoGain)
Пример #10
0
 def maxBag(self):
     """Return the index of the bag with the greatest weight.

     A bag replaces the current best when its weight is greater than or
     equal to it, so among tied bags the last one wins.  Returns -1 when
     there are no bags or no weight reaches zero.
     """
     heaviest = 0
     heaviestIdx = -1
     for bag, weight in enumerate(self.m_perBag):
         if Utils.gr(weight, heaviest) or Utils.equal(weight, heaviest):
             heaviest = weight
             heaviestIdx = bag
     return heaviestIdx
Пример #11
0
 def check(self, minNoObj: float):
     """Return True when at least two bags hold ``minNoObj`` weight or more.

     Scanning stops as soon as the second qualifying bag is found.
     """
     qualifying = 0
     for weight in self.m_perBag:
         if not (Utils.gr(weight, minNoObj) or Utils.equal(weight, minNoObj)):
             continue
         qualifying += 1
         if qualifying > 1:
             return True
     return False
Пример #12
0
 def laplaceProb(self, classIndex: int, intIndex: int = None):
     """Return the Laplace-corrected probability of class ``classIndex``.

     Without ``intIndex`` the estimate is taken over the whole distribution:
     ``(classWeight + 1) / (total + numberOfClasses)``.  With ``intIndex``
     the same formula is applied inside that bag; a bag whose weight is not
     greater than zero falls back to the whole-distribution estimate.
     """
     if intIndex is not None:
         if not Utils.gr(self.m_perBag[intIndex], 0):
             return self.laplaceProb(classIndex)
         inBag = self.m_perClassPerBag[intIndex][classIndex] + 1
         return inBag / (self.m_perBag[intIndex] + len(self.m_perClass))
     overall = self.m_perClass[classIndex] + 1
     return overall / (self.totaL + len(self.m_perClass))
Пример #13
0
 def prune(self):
     """Prune the subtree rooted at this node, children first.

     After every son has been pruned, the node is turned into a leaf when
     ``errorsForTree()`` is greater than or equal to ``errorsForLeaf()``:
     the sons are dropped, ``m_isLeaf`` becomes True and the local model is
     replaced by a ``NoSplit`` built from the current distribution.
     Leaves are left untouched.
     """
     if self.m_isLeaf:
         return
     for i in range(len(self.m_sons)):
         self.son(i).prune()
     # Evaluate each estimate once; both feed the tolerance-based ">=".
     treeErrors = self.errorsForTree()
     leafErrors = self.errorsForLeaf()
     if Utils.gr(treeErrors, leafErrors) or Utils.equal(
             treeErrors, leafErrors):
         self.m_sons = None
         # Must be True: a falsy value would make a later prune() treat
         # this node as internal and iterate over the discarded sons.
         self.m_isLeaf = True
         self.m_localModel = NoSplit(self.localModel().distribution())
Пример #14
0
 def prob(self, classIndex: int, intIndex: int = None):
     """Return the relative frequency of class ``classIndex``.

     Without ``intIndex`` this is the class weight over the total weight,
     or 0 when the total equals zero.  With ``intIndex`` it is the class
     weight inside that bag over the bag weight; a bag whose weight is not
     greater than zero falls back to the whole-distribution value.
     """
     if intIndex is not None:
         if Utils.gr(self.m_perBag[intIndex], 0):
             inBag = self.m_perClassPerBag[intIndex][classIndex]
             return inBag / self.m_perBag[intIndex]
         return self.prob(classIndex)
     if Utils.equal(self.totaL, 0):
         return 0
     return self.m_perClass[classIndex] / self.totaL
Пример #15
0
 def splitEnt(self, bags: Distribution, totalnoInst: float = None):
     """Return the split entropy of ``bags``.

     Without ``totalnoInst`` the parent class implementation is used.  With
     it, the weight in ``totalnoInst`` that is not covered by ``bags`` is
     treated as one extra group: the result is
     ``lnFunc(totalnoInst) - lnFunc(uncovered) - sum(lnFunc(perBag))``
     divided by ``log(2)``, or 0 when ``bags`` holds no weight.
     """
     if totalnoInst is None:
         return super().splitEnt(bags)
     uncovered = totalnoInst - bags.total()
     acc = 0
     if Utils.gr(bags.total(), 0):
         for bag in range(bags.numBags()):
             acc = acc - self.lnFunc(bags.perBag(bag))
         acc = acc - self.lnFunc(uncovered) + self.lnFunc(totalnoInst)
     return acc / math.log(2)
Пример #16
0
 def handleEnumeratedAttribute(self, instances: Instances):
     """Search for the best one-value-versus-rest split on a nominal attribute.

     A distribution with one bag per attribute value is built from the
     instances whose value is known and stored in ``m_distribution``.  For
     every value holding at least ``m_minNoObj`` weight, a second
     distribution is derived via ``Distribution(perValue, value)``; when it
     passes ``check(m_minNoObj)`` its info gain and gain ratio are computed.
     The candidate is adopted when it belongs to value 0 or beats the
     current ``m_gainRatio``, updating ``m_gainRatio``, ``m_infoGain``,
     ``m_splitPoint`` and ``m_distribution``.
     """
     valueCount = instances.attribute(self.m_attIndex).numValues()
     perValue = Distribution(valueCount, instances.numClasses())
     for row in instances:
         if row.isMissing(self.m_attIndex):
             continue
         perValue.add(int(row.value(self.m_attIndex)), row)
     self.m_distribution = perValue

     for value in range(valueCount):
         enough = Utils.gr(perValue.perBag(value), self.m_minNoObj) or \
             Utils.equal(perValue.perBag(value), self.m_minNoObj)
         if not enough:
             continue
         candidate = Distribution(perValue, value)
         if not candidate.check(self.m_minNoObj):
             continue
         self.m_numSubsets = 2
         gain = self.infoGainCrit.splitCritValue(candidate,
                                                 self.m_sumOfWeights)
         ratio = self.gainRatioCrit.splitCritValue(candidate,
                                                   self.m_sumOfWeights, gain)
         # A candidate for value 0 is always taken, regardless of its ratio.
         if value == 0 or Utils.gr(ratio, self.m_gainRatio):
             self.m_gainRatio = ratio
             self.m_infoGain = gain
             self.m_splitPoint = value
             self.m_distribution = candidate
Пример #17
0
 def batchFinished(self):
     """Finish the current batch of input instances.

     On the first batch (``m_ModesAndMeans`` is still None) this computes,
     per attribute, the index of the largest weighted count for nominal
     attributes and the weighted mean for numeric attributes, stores them
     in ``m_ModesAndMeans`` and then passes every buffered instance to
     ``convertInstance``.  In all cases the input buffer is flushed and
     ``m_NewBatch`` is set.

     Returns True when ``numPendingOutput()`` is non-zero.
     Raises Exception when no input format has been defined.
     """
     if self.getInputFormat() is None:
         raise Exception("No input instance format defined")
     if self.m_ModesAndMeans is None:
         sumOfWeights=self.getInputFormat().sumOfWeight()
         # counts[i] holds per-value weights for nominal attribute i and
         # stays an empty list for every other attribute.
         counts=[[] for k in range(self.getInputFormat().numAttributes())]
         for i in range(self.getInputFormat().numAttributes()):
             if self.getInputFormat().attribute(i).isNominal():
                 counts[i]=[0]*self.getInputFormat().attribute(i).numValues()
                 if len(counts[i]) > 0:
                     # Value 0 starts with the whole weight; the loop below
                     # subtracts the weight of every entry it visits.
                     # Presumably this accounts for zero entries that sparse
                     # instances do not store -- TODO confirm.
                     counts[i][0]=sumOfWeights
         # sums[i]: weight available to the mean of attribute i; reduced
         # below for each missing numeric entry.
         sums=[]
         for i in range(self.getInputFormat().numAttributes()):
             sums.append(sumOfWeights)
         # results[i]: weighted sum of the values of numeric attribute i.
         results=[0]*self.getInputFormat().numAttributes()
         for j in range(self.getInputFormat().numInstances()):
             inst=self.getInputFormat().instance(j)
             # Iterate over the entries of the instance; inst.index(i) maps
             # entry position i to the attribute index.
             for i in range(inst.numValues()):
                 if not inst.isMissingSparse(i):
                     value=inst.valueSparse(i)
                     if inst.attributeSparse(i).isNominal():
                         if len(counts[inst.index(i)]) > 0:
                             # Move this weight from value 0 to the actual value.
                             counts[inst.index(i)][int(value)]+=inst.weight()
                             counts[inst.index(i)][0]-=inst.weight()
                     elif inst.attributeSparse(i).isNumeric():
                         results[inst.index(i)]+=inst.weight()*inst.valueSparse(i)
                 else:
                     # Missing entry: remove its weight from the statistics.
                     if inst.attributeSparse(i).isNominal():
                         if len(counts[inst.index(i)]) > 0 :
                             counts[inst.index(i)][0]-=inst.weight()
                     elif inst.attributeSparse(i).isNumeric():
                         sums[inst.index(i)]-=inst.weight()
         self.m_ModesAndMeans=[0]*self.getInputFormat().numAttributes()
         for i in range(self.getInputFormat().numAttributes()):
             if self.getInputFormat().attribute(i).isNominal():
                 if len(counts[i]) == 0:
                     # Nominal attribute without any declared value.
                     self.m_ModesAndMeans[i]= Utils.missingValue()
                 else:
                     self.m_ModesAndMeans[i]= Utils.maxIndex(counts[i])
             elif self.getInputFormat().attribute(i).isNumeric():
                 # Entry stays 0 when no positive weight is left.
                 if Utils.gr(sums[i], 0):
                     self.m_ModesAndMeans[i]=results[i]/sums[i]
         # Convert the instances buffered while the statistics were unknown.
         for i in range(self.getInputFormat().numInstances()):
             self.convertInstance(self.getInputFormat().instance(i))
     self.flushInput()
     self.m_NewBatch=True
     return self.numPendingOutput() != 0
Пример #18
0
    def toSummaryString(self,printComplexityStatistics:bool,title:str="=== Summary ===\n"):
        """Return a multi-line text summary of the evaluation statistics.

        ``title`` is emitted first, followed by a newline.  Only metrics
        whose key is contained in ``self.m_metricsToDisplay`` are printed.
        Complexity statistics are suppressed when ``m_NoPriors`` is set,
        even if ``printComplexityStatistics`` is True.  Each value is
        formatted with ``Utils.doubleToString(value, 12, 4)`` after a
        fixed-width label.
        """
        if printComplexityStatistics and self.m_NoPriors:
            printComplexityStatistics=False
        metrics = self.m_metricsToDisplay
        fmt = Utils.doubleToString
        text=title+'\n'
        if self.m_WithClass > 0:
            if self.m_ClassIsNominal:
                if "correct" in metrics:
                    text+="Correctly Classified Instances     "
                    text+= fmt(self.correct(), 12, 4) + "     " + fmt(self.pctCorrect(), 12, 4) + " %\n"
                if "incorrect" in metrics:
                    text+="Incorrectly Classified Instances   "
                    text+= fmt(self.incorrect(), 12, 4) + "     " + fmt(self.pctIncorrect(), 12, 4) + " %\n"
                if "kappa" in metrics:
                    text+="Kappa statistic                    "
                    text+= fmt(self.kappa(), 12, 4) + "\n"
                if printComplexityStatistics:
                    if "kb relative" in metrics:
                        text+="K&B Relative Info Score            "
                        text+= fmt(self.KBRelativeInformation(), 12, 4) + " %\n"
                    if "kb information" in metrics:
                        text+="K&B Information Score              "
                        text+= fmt(self.KBInformation(), 12, 4) + " bits"
                        text+= fmt(self.KBMeanInformation(), 12, 4) + " bits/instance\n"
            else:
                if "correlation" in metrics:
                    text+="Correlation coefficient            "
                    text+= fmt(self.correlationCoefficient(), 12, 4) + "\n"
            if printComplexityStatistics and self.m_ComplexityStatisticsAvailable:
                if "complexity 0" in metrics:
                    text+="Class complexity | order 0         "
                    text+= fmt(self.SFPriorEntropy(), 12, 4) + " bits"
                    text+= fmt(self.SFMeanPriorEntropy(), 12, 4) + " bits/instance\n"
                if "complexity scheme" in metrics:
                    text+="Class complexity | scheme          "
                    text+= fmt(self.SFSchemeEntropy(), 12, 4) + " bits"
                    text+= fmt(self.SFMeanSchemeEntropy(), 12, 4) + " bits/instance\n"
                if "complexity improvement" in metrics:
                    text+="Complexity improvement     (Sf)    "
                    text+= fmt(self.SFEntropyGain(), 12, 4) + " bits"
                    text+= fmt(self.SFMeanEntropyGain(), 12, 4) + " bits/instance\n"
            if "mae" in metrics:
                text+="Mean absolute error                "
                text+= fmt(self.meanAbsoluteError(), 12, 4) + "\n"
            if "rmse" in metrics:
                text+="Root mean squared error            "
                text+= fmt(self.rootMeanSquaredError(), 12, 4) + "\n"
            # The relative errors are only printed when priors are in use.
            if not self.m_NoPriors:
                if "rae" in metrics:
                    text+="Relative absolute error            "
                    text+= fmt(self.relativeAbsoluteError(), 12, 4) + " %\n"
                if "rrse" in metrics:
                    text+="Root relative squared error        "
                    text+= fmt(self.rootRelativeSquaredError(), 12, 4) + " %\n"
            if self.m_CoverageStatisticsAvailable:
                if "coverage" in metrics:
                    # The opening parenthesis balances " level)" and keeps
                    # the label as wide as every other label in the report.
                    text+="Coverage of cases (" + fmt(self.m_ConfLevel, 4, 2) + " level)     "
                    text+= fmt(self.coverageOfTestCasesByPredictedRegions(), 12, 4) + " %\n"
                if not self.m_NoPriors:
                    if "region size" in metrics:
                        text+="Mean rel. region size (" + fmt(self.m_ConfLevel, 4, 2) + " level) "
                        text+= fmt(self.sizeOfPredictedRegions(), 12, 4) + " %\n"
        if Utils.gr(self.unclassified(), 0):
            text+="UnClassified Instances             "
            text+= fmt(self.unclassified(), 12, 4) + "     " + fmt(self.pctUnclassified(), 12, 4) + " %\n"
        text+="Total Number of Instances          "
        text+= fmt(self.m_WithClass, 12, 4) + "\n"
        if self.m_MissingClass>0:
            text+="Ignored Class Unknown Instances            "
            text+= fmt(self.m_MissingClass, 12, 4) + "\n"
        return text