def prune(self):
    """Prune this subtree bottom-up using C4.5 error-based pruning with optional subtree raising.

    Compares the estimated errors of (a) the subtree as-is, (b) replacing it
    with a leaf, and (c) raising the largest branch into this node's place,
    and keeps the smallest (with a 0.1 tolerance favouring the simpler form).
    Mutates m_sons/m_isLeaf/m_localModel in place; recurses after raising.
    """
    if not self.m_isLeaf:
        # Prune all children first so error estimates below reflect pruned subtrees.
        for i in range(len(self.m_sons)):
            self.son(i).prune()
        # Branch holding the largest share of the training weight.
        indexOfLargestBranch = self.localModel().distribution().maxBag()
        if self.m_subtreeRaising:
            errorsLargestBranch = self.son(
                indexOfLargestBranch).getEstimatedErrorsForBranch(
                self.m_train)
        else:
            # Subtree raising disabled: make raising unselectable.
            errorsLargestBranch = float("inf")
        errorsLeaf = self.getEstimatedErrorsForDistribution(
            self.localModel().distribution())
        errorsTree = self.getEstimatedErrors()
        # Replace subtree by a leaf if the leaf is no worse (within +0.1)
        # than both the subtree and the largest branch.
        if (Utils.gr(errorsTree + 0.1, errorsLeaf) or Utils.equal(errorsTree + 0.1, errorsLeaf)) and \
                (Utils.gr(errorsLargestBranch + 0.1, errorsLeaf) or Utils.equal(errorsLargestBranch + 0.1, errorsLeaf)):
            self.m_sons = None
            self.m_isLeaf = True
            self.m_localModel = NoSplit(self.localModel().distribution())
            return
        # Otherwise raise the largest branch into this node if it is no worse
        # (within +0.1) than the current subtree, then re-prune the raised tree.
        if Utils.gr(errorsTree + 0.1, errorsLargestBranch) or Utils.equal(
                errorsTree + 0.1, errorsLargestBranch):
            largestBranch = self.son(indexOfLargestBranch)
            self.m_sons = largestBranch.m_sons
            self.m_localModel = largestBranch.localModel()
            self.m_isLeaf = largestBranch.m_isLeaf
            # Recompute the distributions for the raised subtree on this node's data.
            self.newDistribution(self.m_train)
            self.prune()
def split(self, data: Instances) -> List[Instances]:
    """Partition data into one Instances object per subset of this split model.

    Instances whose subset is known go to that subset; instances with an
    unknown subset (whichSubset == -1) are copied into every subset with a
    positive weight, re-weighted proportionally.
    """
    # First pass: count how many instances land in each subset so each
    # Instances container can be sized up front.
    bagCounts = [0] * self.m_numSubsets
    for inst in data:
        bag = self.whichSubset(inst)
        if bag > -1:
            bagCounts[bag] += 1
        else:
            fractions = self.weights(inst)
            for j in range(self.m_numSubsets):
                if Utils.gr(fractions[j], 0):
                    bagCounts[j] += 1
    parts = [Instances(data, bagCounts[j]) for j in range(self.m_numSubsets)]  # type: List[Instances]
    # Second pass: distribute the instances.
    for inst in data:
        bag = self.whichSubset(inst)
        if bag > -1:
            parts[bag].add(inst)
        else:
            fractions = self.weights(inst)
            for j in range(self.m_numSubsets):
                if Utils.gr(fractions[j], 0):
                    parts[j].add(inst)
                    # Down-weight the fractional copy just added.
                    parts[j].lastInstance().setWeight(float(fractions[j] * inst.weight()))
    return parts
def computeAverageClassValues(self):
    """Compute, per nominal attribute, the average class value of each attribute
    value, and store the value ordering induced by those averages in m_Indices.

    Attribute values that never occur fall back to the overall average
    (sums / totalCounts). Non-nominal attributes keep empty lists.
    """
    numAtts = self.getInputFormat().numAttributes()
    avgClassValues = [[] for _ in range(numAtts)]
    self.m_Indices = [[] for _ in range(numAtts)]
    for j in range(numAtts):
        att = self.getInputFormat().attribute(j)
        if att.isNominal():
            avgClassValues[j] = [0] * att.numValues()
            counts = [0] * att.numValues()
            for i in range(self.getInputFormat().numInstances()):
                instance = self.getInputFormat().instance(i)
                if not instance.classIsMissing() and not instance.isMissing(j):
                    counts[int(instance.value(j))] += instance.weight()
                    # BUG FIX: accumulate weight * CLASS VALUE (was
                    # weight * weight, which never looked at the class and
                    # made the "average class value" meaningless).
                    avgClassValues[j][int(instance.value(j))] += \
                        instance.weight() * instance.classValue()
            sums = sum(avgClassValues[j])
            totalCounts = sum(counts)
            if Utils.gr(totalCounts, 0):
                for k in range(att.numValues()):
                    if Utils.gr(counts[k], 0):
                        avgClassValues[j][k] /= counts[k]
                    else:
                        # Unseen value: use the overall average as a fallback.
                        avgClassValues[j][k] = sums / totalCounts
            self.m_Indices[j] = Utils.sortDouble(avgClassValues[j])
def selectModel(self, data: Instances, test: Instances = None):
    """Select the best C4.5 split model for data (ignores test — the two-argument
    form simply delegates to the one-argument form).

    Returns a NoSplit model when the data is too small or pure, when no
    attribute yields a usable split, or when the best gain ratio is ~0;
    otherwise returns the C45Split with the highest gain ratio among
    attributes whose info gain is at least (average info gain - 1e-3).
    """
    if test is not None:
        return self.selectModel(data)
    multiVal = True
    averageInfoGain = validModels = 0
    checkDistribution = Distribution(data)
    noSplitModel = NoSplit(checkDistribution)
    # Stop early: too few instances for two branches, or all weight in one class.
    if Utils.gr(2 * self.m_minNoObj, checkDistribution.total()) or \
            Utils.equal(checkDistribution.total(),
                        checkDistribution.perClass(checkDistribution.maxClass())):
        return noSplitModel
    # Decide whether the dataset is dominated by many-valued nominal attributes.
    if self.m_allData is not None:
        for attr in data.enumerateAttributes():
            if attr.isNumeric() or Utils.gr(
                    0.3 * self.m_allData.numInstances(), attr.numValues()):
                multiVal = False
                break
    currentModel = [None] * data.numAttributes()  # type:List[C45Split]
    sumOfWeights = data.sumOfWeight()
    # Build a candidate split per non-class attribute and average the info
    # gains of the "reasonable" ones.
    for i in range(data.numAttributes()):
        if i != data.classIndex():
            currentModel[i] = C45Split(i, self.m_minNoObj, sumOfWeights,
                                       self.m_useMDLcorrection)
            # NOTE(review): "buildClassifer" is presumably the (misspelled)
            # method name on the project's C45Split class — confirm before renaming.
            currentModel[i].buildClassifer(data)
            if currentModel[i].checkModel():
                if self.m_allData is not None:
                    # Only count attributes that aren't excessively multi-valued
                    # (unless everything is multi-valued anyway).
                    if data.attribute(i).isNumeric() or \
                            (multiVal or Utils.gr(0.3 * self.m_allData.numInstances(),
                                                  data.attribute(i).numValues())):
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain()
                        validModels += 1
                else:
                    averageInfoGain = averageInfoGain + currentModel[i].infoGain()
                    validModels += 1
        else:
            currentModel[i] = None
    if validModels == 0:
        return noSplitModel
    averageInfoGain = averageInfoGain / validModels
    # Pick the highest-gain-ratio split among those with near-average info gain.
    minResult = 0
    for i in range(data.numAttributes()):
        if i != data.classIndex() and currentModel[i].checkModel():
            if currentModel[i].infoGain() >= averageInfoGain - 1e-3 and \
                    Utils.gr(currentModel[i].gainRatio(), minResult):
                bestModel = currentModel[i]
                minResult = currentModel[i].gainRatio()
    if Utils.equal(minResult, 0):
        return noSplitModel
    # Fold instances with unknown values into the chosen split's distribution.
    bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
    if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
        bestModel.setSplitPoint(self.m_allData)
    return bestModel
def splitCritValue(self, bags: Distribution,
                   totalNoInst: float = None,
                   numerator: float = None):
    """Gain-ratio split criterion — three merged overloads selected by which
    optional arguments are provided.

    - (bags): returns splitEnt/infoGain, a quantity to be MINIMISED
      (reciprocal of the gain ratio); inf when either term is ~0.
    - (bags, totalNoInst): entropy of the split where the weight missing
      from bags (totalNoInst - bags.total()) forms one extra pseudo-bag of
      unknown-value instances. NOTE(review): this duplicates a splitEnt
      computation — confirm the overload is intended here.
    - (bags, totalNoInst, numerator): the gain ratio
      numerator / (splitEnt / totalNoInst); 0 when the split entropy is 0.
    """
    if totalNoInst is None and numerator is None:
        # Information gain of the split.
        numerator = self.oldEnt(bags) - self.newEnt(bags)
        if Utils.equal(numerator, 0):
            return float('inf')
        denumerator = self.splitEnt(bags)
        if Utils.equal(denumerator, 0):
            return float('inf')
        # Smaller is better: split entropy per unit of information gained.
        return denumerator / numerator
    elif numerator is None:
        res = 0
        # Weight of instances with unknown attribute value.
        noUnkown = totalNoInst - bags.total()
        if Utils.gr(bags.total(), 0):
            for i in range(bags.numBags()):
                res = res - self.lnFunc(bags.perBag(i))
            res = res - self.lnFunc(noUnkown)
            res = res + self.lnFunc(totalNoInst)
        # Convert natural log to log base 2.
        return res / math.log(2)
    else:
        denumerator = self.splitEnt(bags, totalNoInst)
        if Utils.equal(denumerator, 0):
            return 0
        denumerator /= totalNoInst
        return numerator / denumerator
def buildClassifier(self, instances: Instances):
    """Build a majority-class / mean-value (ZeroR-style) model.

    For a nominal class, accumulates Laplace-smoothed weighted class counts
    (each count starts at 1) and stores the majority class index in
    m_ClassValue with normalized counts in m_Counts. For a numeric class,
    stores the weighted mean class value in m_ClassValue.
    """
    self.getCapabilities().testWithFail(instances)
    sumOfWeights = 0
    self.m_Class = instances.classAttribute()
    self.m_ClassValue = 0
    attrType = instances.classAttribute().type()
    if attrType == Attribute.NUMERIC:
        self.m_Counts = None
    elif attrType == Attribute.NOMINAL:
        # Laplace smoothing: start every class count at 1.
        self.m_Counts = []
        for i in range(instances.numClasses()):
            self.m_Counts.append(1)
        sumOfWeights = instances.numClasses()
    for instance in instances:
        classValue = instance.classValue()
        if not Utils.isMissingValue(classValue):
            if instances.classAttribute().isNominal():
                # BUG FIX: classValue is a float (Java code casts to int);
                # indexing a Python list with a float raises TypeError.
                self.m_Counts[int(classValue)] += instance.weight()
            else:
                self.m_ClassValue += instance.weight() * classValue
            sumOfWeights += instance.weight()
    if instances.classAttribute().isNumeric():
        if Utils.gr(sumOfWeights, 0):
            self.m_ClassValue /= sumOfWeights
    else:
        # Majority class index; normalize counts into a distribution.
        self.m_ClassValue = Utils.maxIndex(self.m_Counts)
        Utils.normalize(self.m_Counts, sumOfWeights)
def maxClass(self, index: int = None):
    """Return the index of the most frequent class, overall (index is None)
    or within bag `index`. An empty bag falls back to the overall maximum.
    Ties keep the earliest class index."""
    if index is None:
        best = 0
        bestClass = 0
        for i, weight in enumerate(self.m_perClass):
            if Utils.gr(weight, best):
                best = weight
                bestClass = i
        return bestClass
    if Utils.gr(self.m_perBag[index], 0):
        best = 0
        bestClass = 0
        for i in range(len(self.m_perClass)):
            weight = self.m_perClassPerBag[index][i]
            if Utils.gr(weight, best):
                best = weight
                bestClass = i
        return bestClass
    # Empty bag: defer to the distribution-wide majority class.
    return self.maxClass()
def dumpLabel(self, index: int, data: Instances):
    """Render the label for branch `index`: majority class name, bag weight,
    and (when present) the number of misclassified instances, e.g.
    "yes (12.0/3.0)"."""
    pieces = [data.classAttribute().value(self.m_distribution.maxClass(index))]
    pieces.append(" (" + str(Utils.roundDouble(self.m_distribution.perBag(index), 2)))
    incorrect = self.m_distribution.numIncorrect(index)
    if Utils.gr(incorrect, 0):
        pieces.append("/" + str(Utils.roundDouble(incorrect, 2)))
    pieces.append(")")
    return "".join(pieces)
def handleNumericAttribute(self, trainInstances: Instances):
    """Find the best binary split point on a numeric attribute (C4.5 style).

    Assumes trainInstances is sorted on the attribute with missing values
    last (presumably done by the caller — TODO confirm). On success sets
    m_splitPoint, m_infoGain, m_gainRatio, m_distribution and m_numSubsets;
    returns early (leaving the model unusable) otherwise.
    """
    lastDistinct = 0
    splitIndex = -1
    self.m_distribution = Distribution(2, trainInstances.numClasses())
    # All instances with a known attribute value start in bag 1.
    i = 0
    for inst in trainInstances:
        if inst.isMissing(self.m_attIndex):
            break
        self.m_distribution.add(1, inst)
        i += 1
    firstMiss = i
    # Minimum weight per branch: 10% of an average class's weight,
    # clamped to [m_minNoObj, 25].
    minSplit = 0.1 * self.m_distribution.total() / trainInstances.numClasses()
    if Utils.gr(self.m_minNoObj, minSplit) or Utils.equal(minSplit, self.m_minNoObj):
        minSplit = self.m_minNoObj
    elif Utils.gr(minSplit, 25):
        minSplit = 25
    # Not enough known-value instances to form two branches.
    if Utils.gr(2 * minSplit, firstMiss):
        return
    defaultEnt = self.infoGainCrit.oldEnt(self.m_distribution)
    # BUG FIX: removed leftover debug output: print("dfalut", defaultEnt)
    # Scan candidate cut points between distinct adjacent attribute values.
    # ("cursor" renamed from "next", which shadowed the builtin.)
    cursor = 1
    while cursor < firstMiss:
        if trainInstances.instance(cursor - 1).value(self.m_attIndex) + 1e-5 < \
                trainInstances.instance(cursor).value(self.m_attIndex):
            # Move instances [lastDistinct, cursor) from bag 1 to bag 0.
            self.m_distribution.shiftRange(1, 0, trainInstances, lastDistinct, cursor)
            if (Utils.gr(self.m_distribution.perBag(0), minSplit)
                    or Utils.equal(self.m_distribution.perBag(0), minSplit)) \
                    and (Utils.gr(self.m_distribution.perBag(1), minSplit)
                         or Utils.equal(self.m_distribution.perBag(1), minSplit)):
                currentInfoGain = self.infoGainCrit.splitCritValue(
                    self.m_distribution, self.m_sumOfWeights, defaultEnt)
                if Utils.gr(currentInfoGain, self.m_infoGain):
                    self.m_infoGain = currentInfoGain
                    splitIndex = cursor - 1
                self.m_index += 1
            lastDistinct = cursor
        cursor += 1
    if self.m_index == 0:
        return
    # MDL penalty for having chosen among m_index candidate cut points.
    if self.m_useMDLcorrection:
        self.m_infoGain = self.m_infoGain - (Utils.log2(self.m_index) /
                                             self.m_sumOfWeights)
    if Utils.gr(0, self.m_infoGain) or Utils.equal(0, self.m_infoGain):
        return
    self.m_numSubsets = 2
    # Split point: midpoint of the adjacent values, snapped down to an
    # actually occurring value when the midpoint rounds up to the higher one.
    self.m_splitPoint = (
        trainInstances.instance(splitIndex + 1).value(self.m_attIndex) +
        trainInstances.instance(splitIndex).value(self.m_attIndex)) / 2
    if self.m_splitPoint == trainInstances.instance(splitIndex + 1).value(
            self.m_attIndex):
        self.m_splitPoint = trainInstances.instance(splitIndex).value(
            self.m_attIndex)
    # Rebuild the final two-bag distribution around the chosen split point.
    self.m_distribution = Distribution(2, trainInstances.numClasses())
    self.m_distribution.addRange(0, trainInstances, 0, splitIndex + 1)
    self.m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss)
    self.m_gainRatio = self.gainRatioCrit.splitCritValue(
        self.m_distribution, self.m_sumOfWeights, self.m_infoGain)
def maxBag(self):
    """Return the index of the heaviest bag; ties go to the later bag
    (the grOrEq comparison keeps updating on equal weights).
    Returns -1 only if there are no bags at all."""
    heaviest = 0  # renamed from "max", which shadowed the builtin
    heaviestIndex = -1
    for i, bagWeight in enumerate(self.m_perBag):
        if Utils.gr(bagWeight, heaviest) or Utils.equal(bagWeight, heaviest):
            heaviest = bagWeight
            heaviestIndex = i
    return heaviestIndex
def check(self, minNoObj: float):
    """Return True iff at least two bags carry a weight >= minNoObj
    (i.e. the split produces at least two viable branches)."""
    viable = 0
    for bagWeight in self.m_perBag:
        if Utils.gr(bagWeight, minNoObj) or Utils.equal(bagWeight, minNoObj):
            viable += 1
            if viable > 1:
                return True
    return False
def laplaceProb(self, classIndex: int, intIndex: int = None):
    """Laplace-smoothed probability of class classIndex, over the whole
    distribution (intIndex None) or within bag intIndex. An empty bag
    falls back to the overall estimate."""
    if intIndex is None:
        return (self.m_perClass[classIndex] + 1) / \
               (self.totaL + len(self.m_perClass))
    if Utils.gr(self.m_perBag[intIndex], 0):
        smoothedCount = self.m_perClassPerBag[intIndex][classIndex] + 1
        smoothedTotal = self.m_perBag[intIndex] + len(self.m_perClass)
        return smoothedCount / smoothedTotal
    # Empty bag: use the distribution-wide Laplace estimate.
    return self.laplaceProb(classIndex)
def prune(self):
    """Prune this subtree bottom-up: after pruning all children, replace the
    subtree with a leaf whenever the leaf's error estimate is no worse than
    the subtree's (reduced-error style pruning).
    """
    if not self.m_isLeaf:
        # Prune children first so errorsForTree() reflects pruned subtrees.
        for i in range(len(self.m_sons)):
            self.son(i).prune()
        if Utils.gr(self.errorsForTree(), self.errorsForLeaf()) or Utils.equal(
                self.errorsForTree(), self.errorsForLeaf()):
            self.m_sons = None
            # BUG FIX: mark the node as a leaf with True, not None — the flag
            # is tested as a boolean (e.g. "if not self.m_isLeaf" above, and
            # the sibling prune() sets True), and None left the node looking
            # like an internal node with no sons.
            self.m_isLeaf = True
            self.m_localModel = NoSplit(self.localModel().distribution())
def prob(self, classIndex: int, intIndex: int = None):
    """Relative frequency of class classIndex, over the whole distribution
    (intIndex None) or within bag intIndex. Empty totals yield 0 / fall
    back to the overall estimate."""
    if intIndex is None:
        if Utils.equal(self.totaL, 0):
            return 0
        return self.m_perClass[classIndex] / self.totaL
    if Utils.gr(self.m_perBag[intIndex], 0):
        return self.m_perClassPerBag[intIndex][classIndex] / \
               self.m_perBag[intIndex]
    # Empty bag: use the distribution-wide frequency.
    return self.prob(classIndex)
def splitEnt(self, bags: Distribution, totalnoInst: float = None):
    """Entropy of the split. With totalnoInst supplied, the weight missing
    from bags (totalnoInst - bags.total()) is treated as one extra
    pseudo-bag of unknown-value instances; otherwise defers to the parent
    implementation."""
    if totalnoInst is None:
        return super().splitEnt(bags)
    unknownWeight = totalnoInst - bags.total()
    entropy = 0
    if Utils.gr(bags.total(), 0):
        for bagIdx in range(bags.numBags()):
            entropy -= self.lnFunc(bags.perBag(bagIdx))
        entropy -= self.lnFunc(unknownWeight)
        entropy += self.lnFunc(totalnoInst)
    # Convert natural log to log base 2.
    return entropy / math.log(2)
def handleEnumeratedAttribute(self, instances: Instances):
    """Find the best one-value-vs-rest binary split on a nominal attribute.

    For each attribute value i with enough weight, builds the two-bag
    distribution {value i} vs {all other values} and keeps the one with
    the highest gain ratio, recording it in m_gainRatio, m_infoGain,
    m_splitPoint (the chosen value index) and m_distribution.
    """
    numAttValues = instances.attribute(self.m_attIndex).numValues()
    newDistribution = Distribution(numAttValues, instances.numClasses())
    # One bag per attribute value; instances with a missing value are skipped.
    for inst in instances:
        if not inst.isMissing(self.m_attIndex):
            newDistribution.add(int(inst.value(self.m_attIndex)), inst)
    self.m_distribution = newDistribution
    for i in range(numAttValues):
        # Only consider values carrying at least m_minNoObj weight.
        if Utils.gr(newDistribution.perBag(i), self.m_minNoObj) or \
                Utils.equal(newDistribution.perBag(i), self.m_minNoObj):
            # Two-bag distribution: value i vs everything else.
            secondDistribution = Distribution(newDistribution, i)
            if secondDistribution.check(self.m_minNoObj):
                self.m_numSubsets = 2
                currIG = self.infoGainCrit.splitCritValue(
                    secondDistribution, self.m_sumOfWeights)
                currGR = self.gainRatioCrit.splitCritValue(
                    secondDistribution, self.m_sumOfWeights, currIG)
                # Keep the best candidate (the first valid one unconditionally).
                if i == 0 or Utils.gr(currGR, self.m_gainRatio):
                    self.m_gainRatio = currGR
                    self.m_infoGain = currIG
                    self.m_splitPoint = i
                    self.m_distribution = secondDistribution
def batchFinished(self):
    """Signal the end of the input batch for a replace-missing-values filter.

    On the first call, computes per-attribute modes (nominal) and weighted
    means (numeric) from the buffered input, then converts every buffered
    instance. Returns True iff converted instances are pending output.
    """
    if self.getInputFormat() is None:
        raise Exception("No input instance format defined")
    if self.m_ModesAndMeans is None:
        # --- Compute modes and means over the buffered batch. ---
        sumOfWeights = self.getInputFormat().sumOfWeight()
        counts = [[] for k in range(self.getInputFormat().numAttributes())]
        for i in range(self.getInputFormat().numAttributes()):
            if self.getInputFormat().attribute(i).isNominal():
                counts[i] = [0] * self.getInputFormat().attribute(i).numValues()
                if len(counts[i]) > 0:
                    # Sparse trick: value 0 starts with ALL the weight; every
                    # observed value (including explicit 0s) adjusts it below.
                    counts[i][0] = sumOfWeights
        sums = []
        for i in range(self.getInputFormat().numAttributes()):
            sums.append(sumOfWeights)
        results = [0] * self.getInputFormat().numAttributes()
        for j in range(self.getInputFormat().numInstances()):
            inst = self.getInputFormat().instance(j)
            # Iterate only the stored (sparse) values of the instance.
            for i in range(inst.numValues()):
                if not inst.isMissingSparse(i):
                    value = inst.valueSparse(i)
                    if inst.attributeSparse(i).isNominal():
                        if len(counts[inst.index(i)]) > 0:
                            counts[inst.index(i)][int(value)] += inst.weight()
                            counts[inst.index(i)][0] -= inst.weight()
                    elif inst.attributeSparse(i).isNumeric():
                        results[inst.index(i)] += inst.weight() * inst.valueSparse(i)
                else:
                    # Missing values contribute to neither the counts nor the mean.
                    if inst.attributeSparse(i).isNominal():
                        if len(counts[inst.index(i)]) > 0:
                            counts[inst.index(i)][0] -= inst.weight()
                    elif inst.attributeSparse(i).isNumeric():
                        sums[inst.index(i)] -= inst.weight()
        self.m_ModesAndMeans = [0] * self.getInputFormat().numAttributes()
        for i in range(self.getInputFormat().numAttributes()):
            if self.getInputFormat().attribute(i).isNominal():
                if len(counts[i]) == 0:
                    self.m_ModesAndMeans[i] = Utils.missingValue()
                else:
                    self.m_ModesAndMeans[i] = Utils.maxIndex(counts[i])
            elif self.getInputFormat().attribute(i).isNumeric():
                if Utils.gr(sums[i], 0):
                    self.m_ModesAndMeans[i] = results[i] / sums[i]
        # Convert the buffered instances now that statistics are known.
        for i in range(self.getInputFormat().numInstances()):
            self.convertInstance(self.getInputFormat().instance(i))
        self.flushInput()
    self.m_NewBatch = True
    return self.numPendingOutput() != 0
def toSummaryString(self, printComplexityStatistics: bool,
                    title: str = "=== Summary ===\n"):
    """Build the textual evaluation summary, honouring m_metricsToDisplay and
    the availability flags (m_NoPriors suppresses prior-dependent metrics).

    BUG FIX: the "Coverage of cases" label was missing the opening "(" before
    the confidence level; it now prints "(0.95 level)" like the matching
    "Mean rel. region size (" line.
    NOTE(review): label padding widths were reconstructed for column
    alignment — confirm against the expected report layout.
    """
    if printComplexityStatistics and self.m_NoPriors:
        printComplexityStatistics = False
    text = title + '\n'
    if self.m_WithClass > 0:
        if self.m_ClassIsNominal:
            displayCorrect = "correct" in self.m_metricsToDisplay
            displayIncorrect = "incorrect" in self.m_metricsToDisplay
            displayKappa = "kappa" in self.m_metricsToDisplay
            if displayCorrect:
                text += "Correctly Classified Instances        "
                text += Utils.doubleToString(self.correct(), 12, 4) + "     " + \
                    Utils.doubleToString(self.pctCorrect(), 12, 4) + " %\n"
            if displayIncorrect:
                text += "Incorrectly Classified Instances      "
                text += Utils.doubleToString(self.incorrect(), 12, 4) + "     " + \
                    Utils.doubleToString(self.pctIncorrect(), 12, 4) + " %\n"
            if displayKappa:
                text += "Kappa statistic                       "
                text += Utils.doubleToString(self.kappa(), 12, 4) + "\n"
            if printComplexityStatistics:
                displayKBRelative = "kb relative" in self.m_metricsToDisplay
                displayKBInfo = "kb information" in self.m_metricsToDisplay
                if displayKBRelative:
                    text += "K&B Relative Info Score               "
                    text += Utils.doubleToString(self.KBRelativeInformation(), 12, 4) + " %\n"
                if displayKBInfo:
                    text += "K&B Information Score                 "
                    text += Utils.doubleToString(self.KBInformation(), 12, 4) + " bits"
                    text += Utils.doubleToString(self.KBMeanInformation(), 12, 4) + " bits/instance\n"
            # if self.m_pluginMetrics != null:
        else:
            displayCorrelation = "correlation" in self.m_metricsToDisplay
            if displayCorrelation:
                text += "Correlation coefficient               "
                text += Utils.doubleToString(self.correlationCoefficient(), 12, 4) + "\n"
            # if self.m_pluginMetrics != null:
        if printComplexityStatistics and self.m_ComplexityStatisticsAvailable:
            displayComplexityOrder0 = "complexity 0" in self.m_metricsToDisplay
            displayComplexityScheme = "complexity scheme" in self.m_metricsToDisplay
            displayComplexityImprovement = "complexity improvement" in self.m_metricsToDisplay
            if displayComplexityOrder0:
                text += "Class complexity | order 0            "
                text += Utils.doubleToString(self.SFPriorEntropy(), 12, 4) + " bits"
                text += Utils.doubleToString(self.SFMeanPriorEntropy(), 12, 4) + " bits/instance\n"
            if displayComplexityScheme:
                text += "Class complexity | scheme             "
                text += Utils.doubleToString(self.SFSchemeEntropy(), 12, 4) + " bits"
                text += Utils.doubleToString(self.SFMeanSchemeEntropy(), 12, 4) + " bits/instance\n"
            if displayComplexityImprovement:
                text += "Complexity improvement (Sf)           "
                text += Utils.doubleToString(self.SFEntropyGain(), 12, 4) + " bits"
                text += Utils.doubleToString(self.SFMeanEntropyGain(), 12, 4) + " bits/instance\n"
        displayMAE = "mae" in self.m_metricsToDisplay
        displayRMSE = "rmse" in self.m_metricsToDisplay
        displayRAE = "rae" in self.m_metricsToDisplay
        displayRRSE = "rrse" in self.m_metricsToDisplay
        if displayMAE:
            text += "Mean absolute error                   "
            text += Utils.doubleToString(self.meanAbsoluteError(), 12, 4) + "\n"
        if displayRMSE:
            text += "Root mean squared error               "
            text += Utils.doubleToString(self.rootMeanSquaredError(), 12, 4) + "\n"
        if not self.m_NoPriors:
            if displayRAE:
                text += "Relative absolute error               "
                text += Utils.doubleToString(self.relativeAbsoluteError(), 12, 4) + " %\n"
            if displayRRSE:
                text += "Root relative squared error           "
                text += Utils.doubleToString(self.rootRelativeSquaredError(), 12, 4) + " %\n"
        if self.m_CoverageStatisticsAvailable:
            displayCoverage = "coverage" in self.m_metricsToDisplay
            displayRegionSize = "region size" in self.m_metricsToDisplay
            if displayCoverage:
                # BUG FIX: "(" added before the confidence level.
                text += "Coverage of cases (" + \
                    Utils.doubleToString(self.m_ConfLevel, 4, 2) + " level)        "
                text += Utils.doubleToString(self.coverageOfTestCasesByPredictedRegions(), 12, 4) + " %\n"
            if not self.m_NoPriors:
                if displayRegionSize:
                    text += "Mean rel. region size (" + \
                        Utils.doubleToString(self.m_ConfLevel, 4, 2) + " level)    "
                    text += Utils.doubleToString(self.sizeOfPredictedRegions(), 12, 4) + " %\n"
    if Utils.gr(self.unclassified(), 0):
        text += "UnClassified Instances                "
        text += Utils.doubleToString(self.unclassified(), 12, 4) + "     " + \
            Utils.doubleToString(self.pctUnclassified(), 12, 4) + " %\n"
    text += "Total Number of Instances             "
    text += Utils.doubleToString(self.m_WithClass, 12, 4) + "\n"
    if self.m_MissingClass > 0:
        text += "Ignored Class Unknown Instances       "
        text += Utils.doubleToString(self.m_MissingClass, 12, 4) + "\n"
    return text