def attributeStats(self, index: int) -> AttributeStats:
    """Collect summary statistics for the attribute at position ``index``.

    Nominal attributes get zero-filled ``nominalCounts`` and
    ``nominalWeights`` lists sized by the attribute's number of values;
    numeric attributes get a fresh ``Stats`` object.  Missing values are
    tallied in ``missingCount``.  Every other distinct value is reported once
    through ``addDistinct`` in first-seen order.

    Each occurrence contributes 1.0 to both the count and the weight that are
    passed to ``addDistinct``; the instance's own weight is not consulted.

    :param index: position of the attribute to summarise
    :return: the populated ``AttributeStats`` object
    """
    stats = AttributeStats()
    attr = self.attribute(index)
    if attr.isNominal():
        numValues = attr.numValues()
        stats.nominalCounts = [0] * numValues
        stats.nominalWeights = [0] * numValues
    if attr.isNumeric():
        stats.numericStats = Stats()
    stats.totalCount = self.numInstances()

    # value -> [occurrence count, accumulated weight]
    tallies = {}
    for inst in self.m_Instances:
        value = inst.value(index)
        if Utils.isMissingValue(value):
            stats.missingCount += 1
            continue
        entry = tallies.get(value)
        if entry is None:
            tallies[value] = [1.0, 1.0]
        else:
            entry[0] += 1.0
            entry[1] += 1.0

    for value, (count, weight) in tallies.items():
        stats.addDistinct(value, count, weight)
    return stats
def buildClassifier(self, instances: Instances):
    """Fit the classifier from the class values of ``instances``.

    For a numeric class, ``m_ClassValue`` becomes the weighted sum of the
    non-missing class values divided by the total weight (left as the raw
    sum when ``Utils.gr`` does not report the total as greater than zero)
    and ``m_Counts`` is ``None``.

    For a nominal class, ``m_Counts`` starts at 1 for every class, each
    non-missing instance adds its weight to the slot of its class,
    ``m_ClassValue`` is set to ``Utils.maxIndex(m_Counts)`` and the counts
    are then passed to ``Utils.normalize`` with the total weight.

    :param instances: training data; checked against the classifier's
        capabilities first via ``testWithFail``
    """
    self.getCapabilities().testWithFail(instances)

    classAttr = instances.classAttribute()
    self.m_Class = classAttr
    self.m_ClassValue = 0
    sumOfWeights = 0

    attrType = classAttr.type()
    if attrType == Attribute.NUMERIC:
        self.m_Counts = None
    elif attrType == Attribute.NOMINAL:
        numClasses = instances.numClasses()
        # Every class starts with a count of one, which is why the weight
        # total starts at the number of classes.
        self.m_Counts = [1] * numClasses
        sumOfWeights = numClasses

    isNominal = classAttr.isNominal()
    for instance in instances:
        classValue = instance.classValue()
        if Utils.isMissingValue(classValue):
            continue
        weight = instance.weight()
        if isNominal:
            # The class value is converted to int before it is used as a
            # list index; a float index would raise TypeError.
            self.m_Counts[int(classValue)] += weight
        else:
            self.m_ClassValue += weight * classValue
        sumOfWeights += weight

    if classAttr.isNumeric():
        if Utils.gr(sumOfWeights, 0):
            self.m_ClassValue /= sumOfWeights
    else:
        self.m_ClassValue = Utils.maxIndex(self.m_Counts)
        Utils.normalize(self.m_Counts, sumOfWeights)
def difference(self, index: int, val1: float, val2: float):
    """Return the difference between two values of the attribute at ``index``.

    * Nominal attribute: 1 when either value is missing or the two values
      differ after conversion to ``int``, otherwise 0.
    * Numeric attribute, both present: ``val1 - val2``, computed on the
      normalised values unless ``m_DontNormalize`` is set.
    * Numeric attribute, both missing: 1 when normalising, otherwise the
      stored ``R_WIDTH`` entry of the attribute's range.
    * Numeric attribute, one missing: when normalising, the larger of
      ``d`` and ``1 - d`` for the normalised present value ``d``; otherwise
      the larger of the present value's distances to the stored ``R_MAX``
      and ``R_MIN`` entries.
    * Any other attribute type: 0.

    :param index: position of the attribute in ``m_Data``
    :param val1: first attribute value
    :param val2: second attribute value
    :return: the (signed, when both values are present) difference
    """
    attrType = self.m_Data.attribute(index).type()

    if attrType == Attribute.NOMINAL:
        if (Utils.isMissingValue(val1) or Utils.isMissingValue(val2)
                or int(val1) != int(val2)):
            return 1
        return 0

    if attrType != Attribute.NUMERIC:
        return 0

    missing1 = Utils.isMissingValue(val1)
    missing2 = Utils.isMissingValue(val2)
    normalise = not self.m_DontNormalize

    if not (missing1 or missing2):
        if normalise:
            return self.norm(val1, index) - self.norm(val2, index)
        return val1 - val2

    if missing1 and missing2:
        if normalise:
            return 1
        return self.m_Ranges[index][self.R_WIDTH]

    # Exactly one value is missing: work from the one that is present.
    present = val1 if missing2 else val2
    if normalise:
        diff = self.norm(present, index)
        if diff < 0.5:
            diff = 1 - diff
        return diff

    toUpper = self.m_Ranges[index][self.R_MAX] - present
    toLower = present - self.m_Ranges[index][self.R_MIN]
    if toUpper > toLower:
        return toUpper
    return toLower
def makeDistribution(self, predictedClass: float):
    """Turn a single predicted value into a list of ``m_NumClasses`` entries.

    The list is all zeros when the prediction is missing.  For a nominal
    class the entry at ``int(predictedClass)`` is set to 1; otherwise the
    prediction itself is stored in entry 0.

    :param predictedClass: predicted class index or numeric value
    :return: the list described above
    """
    distribution = [0] * self.m_NumClasses
    if not Utils.isMissingValue(predictedClass):
        if self.m_ClassIsNominal:
            distribution[int(predictedClass)] = 1
        else:
            distribution[0] = predictedClass
    return distribution
def setSplitPoint(self, allInstances: Instances):
    """Move the split point to the largest data value that does not exceed it.

    Only acts when the split attribute is numeric and there is more than one
    subset.  Missing values are ignored.  If no value in ``allInstances`` is
    less than or equal to the current split point, the split point becomes
    negative infinity.

    :param allInstances: the instances whose attribute values are scanned
    """
    attIndex = self.m_attIndex
    if not (allInstances.attribute(attIndex).isNumeric()
            and self.m_numSubsets > 1):
        return

    ceiling = self.m_splitPoint
    best = float("-inf")
    for position in range(allInstances.numInstances()):
        candidate = allInstances.instance(position).value(attIndex)
        if Utils.isMissingValue(candidate):
            continue
        if best < candidate <= ceiling:
            best = candidate
    self.m_splitPoint = best
def distributionForInstance(self, instance: Instance) -> List[float]:
    """Build a class distribution from the result of ``classifyInstance``.

    The list has ``instance.numClasses()`` entries, initially zero.  For a
    nominal class the entry of the predicted class is set to 1.0 (nothing is
    set when the prediction is missing).  For a numeric or date class the
    prediction is stored in entry 0.  For any other class type the all-zero
    list is returned.

    :param instance: the instance to classify
    :return: the distribution list
    """
    distribution = [0] * instance.numClasses()
    classType = instance.classAttribute().type()

    if classType == Attribute.NOMINAL:
        predicted = self.classifyInstance(instance)
        if not Utils.isMissingValue(predicted):
            distribution[int(predicted)] = 1.0
    elif classType == Attribute.NUMERIC or classType == Attribute.DATE:
        distribution[0] = self.classifyInstance(instance)
    return distribution
def updateStatsForPredictor(self, predictedValue: float, instance: Instance):
    """Fold one numeric prediction into the running evaluation totals.

    An instance without a class value only adds its weight to
    ``m_MissingClass``.  Otherwise its weight is added to ``m_WithClass``;
    a missing prediction then adds the weight to ``m_Unclassified`` and
    nothing else.  For a usable prediction the weighted sums of the actual
    value, the predicted value, their squares and their product are updated
    and ``updateNumericScores`` is called with the two distributions.

    :param predictedValue: the value predicted for ``instance``
    :param instance: the instance that was predicted
    """
    weight = instance.weight()

    if instance.classIsMissing():
        self.m_MissingClass += weight
        return

    self.m_WithClass += weight
    if Utils.isMissingValue(predictedValue):
        self.m_Unclassified += weight
        return

    actual = instance.classValue()
    self.m_SumClass += weight * actual
    self.m_SumSqrClass += weight * actual * actual
    self.m_SumClassPredicted += weight * actual * predictedValue
    self.m_SumPredicted += weight * predictedValue
    self.m_SumSqrPredicted += weight * predictedValue * predictedValue
    self.updateNumericScores(
        self.makeDistribution(predictedValue),
        self.makeDistribution(actual),
        weight)
def sort(self, attrIndex=None):
    """Reorder ``m_Instances`` in place by the values of one attribute.

    Nominal attributes are delegated to ``sortBasedOnNominalAttribute``.
    For any other attribute the values are collected, missing values are
    replaced by positive infinity, and the instances are rearranged in the
    order returned by ``Utils.sortWithNoMissingValues``.

    :param attrIndex: attribute position, or an ``Attribute`` whose
        ``index()`` is used
    """
    if isinstance(attrIndex, Attribute):
        attrIndex = attrIndex.index()

    if self.attribute(attrIndex).isNominal():
        self.sortBasedOnNominalAttribute(attrIndex)
        return

    snapshot = []  # type: List[Instance]
    keys = []
    for position in range(self.numInstances()):
        inst = self.instance(position)
        snapshot.append(inst)
        value = inst.value(attrIndex)
        # Missing values get an infinite sort key.
        keys.append(float('inf') if Utils.isMissingValue(value) else value)

    order = Utils.sortWithNoMissingValues(keys)
    for position in range(len(keys)):
        self.m_Instances[position] = snapshot[order[position]]
def evaluationForSingleInstance(self, a0, instance: Instance, storePredictions: bool):
    """Evaluate one instance and update the evaluation statistics.

    ``a0`` may be either

    * a list: treated as the predicted distribution for ``instance``.  For a
      nominal class the prediction is ``Utils.maxIndex`` of the list (a
      missing value when that entry is not positive); otherwise it is the
      first entry.  Statistics are updated and, when requested and not
      discarded, a ``NominalPrediction`` / ``NumericPrediction`` is appended
      to ``m_Predictions``.
    * a ``Classifier``: a deep copy of ``instance`` with its class set to
      missing is passed to the classifier, and the resulting distribution is
      evaluated through the list form above.  For a non-nominal class with a
      known class value and a usable prediction, interval and
      conditional-density statistics are updated when the classifier
      supports them; otherwise the matching availability flag is cleared.

    :param a0: a distribution list or a classifier
    :param instance: the instance being evaluated
    :param storePredictions: whether to record the prediction
    :return: the prediction, or ``None`` when ``a0`` is neither form
    """
    if isinstance(a0, List):
        nominal = self.m_ClassIsNominal
        if nominal:
            pred = Utils.maxIndex(a0)
            if a0[int(pred)] <= 0:
                pred = Utils.missingValue()
            self.updateStatsForClassifier(a0, instance)
        else:
            pred = a0[0]
            self.updateStatsForPredictor(pred, instance)

        if storePredictions and not self.m_DiscardPredictions:
            if self.m_Predictions is None:
                self.m_Predictions = []
            if nominal:
                record = NominalPrediction(instance.classValue(), a0, instance.weight())
            else:
                record = NumericPrediction(instance.classValue(), pred, instance.weight())
            self.m_Predictions.append(record)
        return pred

    if not isinstance(a0, Classifier):
        return None

    classMissing = copy.deepcopy(instance)
    classMissing.setDataset(instance.dataset())
    # TODO: give InputMappedClassifier its own handling here; for now the
    # class value is always cleared.
    classMissing.setClassMissing()
    pred = self.evaluationForSingleInstance(
        a0.distributionForInstance(classMissing), instance, storePredictions)

    if self.m_ClassIsNominal:
        return pred
    if instance.classIsMissing() or Utils.isMissingValue(pred):
        return pred

    if isinstance(a0, IntervalEstimator):
        self.updateStatsForIntervalEstimator(a0, classMissing, instance.classValue())
    else:
        self.m_CoverageStatisticsAvailable = False
    if isinstance(a0, ConditionalDensityEstimator):
        self.updateStatsForConditionalDensityEstimator(a0, classMissing, instance.classValue())
    else:
        self.m_ComplexityStatisticsAvailable = False
    return pred
def toClassDetailsString(self, title: str = "=== Detailed Accuracy By Class ===\n"):
    """Render the per-class accuracy table as text.

    The table has one row per class followed by a "Weighted Avg." row.  Only
    the metrics whose key is contained in ``m_metricsToDisplay`` get a
    column.  Missing metric values are shown as ``?``.

    Layout: a 17-character label column (blank for class rows), then one
    12-character left-aligned column per metric with three decimals, then
    the class name.  Headers and ``?`` placeholders are padded to the same
    12 characters as the values so the columns line up.

    :param title: text placed before the table
    :return: the formatted table
    :raises Exception: if the class attribute is not nominal
    """
    if not self.m_ClassIsNominal:
        raise Exception("Evaluation: No per class statistics possible!")

    labelWidth = 17
    columnWidth = 12

    # (key in m_metricsToDisplay, column header,
    #  per-class method name, weighted-average method name)
    columns = (
        ("tp rate", "TP Rate", "truePositiveRate", "weightedTruePositiveRate"),
        ("fp rate", "FP Rate", "falsePositiveRate", "weightedFalsePositiveRate"),
        ("precision", "Precision", "precision", "weightedPrecision"),
        ("recall", "Recall", "recall", "weightedRecall"),
        ("f-measure", "F-Measure", "fMeasure", "weightedFMeasure"),
        ("mcc", "MCC", "matthewsCorrelationCoefficient", "weightedMatthewsCorrelation"),
        ("roc area", "ROC Area", "areaUnderROC", "weightedAreaUnderROC"),
        ("prc area", "PRC Area", "areaUnderPRC", "weightedAreaUnderPRC"),
    )
    # Methods are looked up by name only for displayed columns, so a metric
    # that is switched off is never touched.
    shown = [column for column in columns
             if column[0] in self.m_metricsToDisplay]

    def cell(value):
        """Format one metric value into a fixed-width table cell."""
        if Utils.isMissingValue(value):
            return "{:<{width}}".format("?", width=columnWidth)
        return "{:<{width}.3f}".format(value, width=columnWidth)

    margin = " " * labelWidth
    parts = [title, "\n", margin]
    parts.extend("{:<{width}}".format(header, width=columnWidth)
                 for _, header, _, _ in shown)
    parts.append("Class\n")

    for classIndex in range(self.m_NumClasses):
        parts.append(margin)
        parts.extend(cell(getattr(self, perClass)(classIndex))
                     for _, _, perClass, _ in shown)
        parts.append(self.m_ClassNames[classIndex] + "\n")

    parts.append("{:<{width}}".format("Weighted Avg.", width=labelWidth))
    parts.extend(cell(getattr(self, weighted)())
                 for _, _, _, weighted in shown)
    parts.append("\n")
    return "".join(parts)
def run(self):
    """Compute the histogram bars for the panel's current attribute.

    The bar width follows ``3.49 * stdDev * n ** (-1/3)`` (Scott's rule),
    the bar count is capped by the painter width, and every instance with a
    value for the attribute adds its weight to the bar its value falls in.

    * When the panel has a nominal class attribute, each bar keeps one
      weight total per class value (slot 0 holds instances whose class is
      missing).  The bars are stored as ``SparseInstance`` objects in
      ``m_histBarClassCounts`` and the panel's colour list is extended to
      cover all class values.
    * Otherwise each bar is a single weight total stored in
      ``m_histBarCounts``.

    ``m_maxValue`` receives the largest bar total and ``m_barRange`` the
    value range covered by one bar.  Finally the panel's state flags are
    updated and ``paint()`` is called.

    The panel mutex is held for the whole computation and is released even
    if an exception is raised.  Degenerate statistics (zero or undefined
    standard deviation, ``max == min``) yield a single bar instead of an
    arithmetic error.
    """
    panel = self.m_panel

    def countBars(spread, width, pixelWidth):
        """Return the number of bars: at least 1, capped by the pixel width."""
        if Utils.isMissingValue(width):
            bars = 1
        else:
            try:
                bars = max(1, int(round(spread / width)))
            except (ZeroDivisionError, ValueError, OverflowError):
                # Zero width, or a NaN/infinite ratio: fall back to one bar.
                bars = 1
        if bars > pixelWidth:
            # Leave a few pixels of padding.
            bars = max(1, pixelWidth - 6)
        return bars

    def barIndex(value, low, barRange, numBars):
        """Map an attribute value to the zero-based index of its bar."""
        if not barRange > 0:
            # All values identical (or range undefined): everything lands
            # in the first bar.
            return 0
        position = int(math.ceil((value - low) / barRange))
        # Bar k covers (low + k*barRange, low + (k+1)*barRange]; the minimum
        # itself belongs to bar 0.  Clamp against rounding at the top end.
        return min(max(position - 1, 0), numBars - 1)

    panel.mutex.lock()
    try:
        data = panel.m_data
        stats = panel.m_as.numericStats
        classIndex = panel.m_classIndex
        attrIndex = panel.m_attrIndex
        numInstances = data.numInstances()
        spread = stats.max - stats.min

        if numInstances > 0:
            intervalWidth = 3.49 * stats.stdDev * math.pow(numInstances, -1 / 3)
        else:
            # math.pow(0, -1/3) would raise; treat the width as undefined.
            intervalWidth = float("nan")
        intervals = countBars(spread, intervalWidth, panel.m_Painter.width())

        if classIndex >= 0 and data.attribute(classIndex).isNominal():
            # One slot per class value plus slot 0 for a missing class.
            numSlots = data.attribute(classIndex).numValues() + 1
            histClassCounts = [[0] * numSlots for _ in range(intervals)]
            Utils.debugOut("max", stats.max)
            Utils.debugOut("min", stats.min)
            Utils.debugOut("intervalWidth", intervalWidth)
            Utils.debugOut("len", len(histClassCounts))
            Utils.debugOut("histClasCount:", histClassCounts)
            barRange = spread / len(histClassCounts)
            panel.m_maxValue = 0

            if len(panel.m_colorList) == 0:
                panel.m_colorList.append("black")
            for i in range(len(panel.m_colorList), numSlots):
                colorStr = AttributeVisualizationPanel.m_colorNames[(i - 1) % 10]
                panel.m_colorList.append(colorStr)

            for k in range(numInstances):
                inst = data.instance(k)
                if inst.isMissing(attrIndex):
                    continue
                bar = barIndex(inst.value(attrIndex), stats.min, barRange,
                               len(histClassCounts))
                if inst.isMissing(classIndex):
                    slot = 0
                else:
                    slot = int(inst.value(classIndex) + 1)
                histClassCounts[bar][slot] += inst.weight()

            for row in histClassCounts:
                total = sum(row)
                if panel.m_maxValue < total:
                    panel.m_maxValue = total

            # Keep only the non-zero class totals of every bar.
            histClassCountsSparse = []
            for row in histClassCounts:
                sparseIndices = [j for j, count in enumerate(row) if count > 0]
                sparseValues = [row[j] for j in sparseIndices]
                histClassCountsSparse.append(
                    SparseInstance(1.0, sparseValues, sparseIndices, len(row)))

            panel.m_histBarClassCounts = histClassCountsSparse
            panel.m_barRange = barRange
        else:
            histCounts = [0] * intervals
            barRange = spread / len(histCounts)
            panel.m_maxValue = 0
            for k in range(numInstances):
                inst = data.instance(k)
                if inst.isMissing(attrIndex):
                    continue
                bar = barIndex(inst.value(attrIndex), stats.min, barRange,
                               len(histCounts))
                histCounts[bar] += inst.weight()
                if histCounts[bar] > panel.m_maxValue:
                    panel.m_maxValue = histCounts[bar]
            panel.m_histBarCounts = histCounts
            panel.m_barRange = barRange

        panel.m_threadRun = False
        panel.m_displayCurrentAttribute = True
        panel.m_doneCurrentAttribute = True
        panel.paint()
    finally:
        panel.mutex.unlock()
def process(self, toPredict: Instance, classifier: Classifier, evaluation: Evaluation):
    """Evaluate one instance and, if enabled, record it for the error plot.

    Nominal class: the classifier's distribution is obtained for a deep copy
    of ``toPredict`` whose class value has been cleared, so the classifier
    never sees the true label (the same precaution
    ``evaluationForSingleInstance`` takes).  The prediction is the index
    returned by ``Utils.maxIndex`` (missing when the distribution sums to
    zero), ``probActual`` is the probability given to the actual class and
    ``probNext`` the largest probability among the other classes.  The
    distribution is then passed to ``evaluation``.

    Other class types: ``evaluateModelOnceAndRecordPrediction`` supplies the
    prediction.

    When ``m_SaveForVisualization`` is set and ``m_PlotInstances`` exists, a
    new plot instance is added whose values are those of ``toPredict`` with
    extra columns inserted at the class position: margin, prediction and
    actual value for a nominal class; prediction and actual value otherwise.
    A matching entry is appended to ``m_PlotShapes`` and ``m_PlotSizes``.

    :param toPredict: the instance to evaluate
    :param classifier: the classifier making the prediction
    :param evaluation: the evaluation object collecting statistics
    """
    probActual = probNext = pred = 0
    isNominal = toPredict.classAttribute().isNominal()

    if isNominal:
        # The copy is only needed for nominal classes.
        classMissing = copy.deepcopy(toPredict)
        classMissing.setDataset(toPredict.dataset())
        # Hide the true label from the classifier.
        classMissing.setClassMissing()

        # Probability distribution over the classes.
        preds = classifier.distributionForInstance(classMissing)
        val = 0
        if sum(preds) == 0:
            # All probabilities are zero: the instance belongs to no class.
            pred = Utils.missingValue()
            probActual = Utils.missingValue()
        else:
            # The prediction is the index of the largest probability.
            pred = Utils.maxIndex(preds)
            if not Utils.isMissingValue(toPredict.classIndex()):
                # A known class value selects the slot; when it is missing
                # slot 0 is used.
                if not Utils.isMissingValue(toPredict.classValue()):
                    val = int(toPredict.classValue())
                probActual = preds[val]
            else:
                probActual = preds[Utils.maxIndex(preds)]
        for i in range(toPredict.classAttribute().numValues()):
            if i != val and preds[i] > probNext:
                probNext = preds[i]
        evaluation.evaluationForSingleInstance(preds, toPredict, True)
    else:
        # Single evaluation step that also records the prediction.
        pred = evaluation.evaluateModelOnceAndRecordPrediction(
            classifier, toPredict)

    if not self.m_SaveForVisualization or self.m_PlotInstances is None:
        return

    # Build the row stored for visualisation.
    classIndex = toPredict.classIndex()
    numPlotAttributes = self.m_PlotInstances.numAttributes()
    values = [0] * numPlotAttributes
    i = 0
    while i < numPlotAttributes:
        if i < classIndex:
            # Attributes before the class are copied unchanged.
            values[i] = toPredict.value(i)
        elif i == classIndex:
            if isNominal:
                # Margin between the actual class and the best other class.
                values[i] = probActual - probNext
                # Predicted class.
                values[i + 1] = pred
                # Actual class.
                values[i + 2] = toPredict.value(i)
                i += 2
            else:
                values[i] = pred
                values[i + 1] = toPredict.value(i)
                i += 1
        else:
            # Attributes after the class are shifted by the inserted columns.
            values[i] = toPredict.value(i - 2 if isNominal else i - 1)
        i += 1
    self.m_PlotInstances.add(Instance(1.0, values))

    classIsMissing = toPredict.isMissing(classIndex)
    if isNominal:
        if classIsMissing or Utils.isMissingValue(pred):
            self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
        elif pred != toPredict.classValue():
            self.m_PlotShapes.append(Plot2D.ERROR_SHAPE)
        else:
            self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)

        if self.m_pointSizeProportionalToMargin:
            self.m_PlotSizes.append(probActual - probNext)
        else:
            # Wrong predictions are drawn one size larger.
            sizeAdj = 1 if pred != toPredict.classValue() else 0
            self.m_PlotSizes.append(Plot2D.DEFAULT_SHAPE_SIZE.value + sizeAdj)
    else:
        errd = None
        if not classIsMissing and not Utils.isMissingValue(pred):
            errd = pred - toPredict.classValue()
            self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)
        else:
            self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
        self.m_PlotSizes.append(errd)
def isMissingSparse(self, index: int):
    """Tell whether the value returned by ``valueSparse(index)`` is missing.

    :param index: index passed on to ``valueSparse``
    :return: ``True`` if the value is missing, ``False`` otherwise
    """
    sparseValue = self.valueSparse(index)
    return True if Utils.isMissingValue(sparseValue) else False
def isMissing(self, attrIndex: int):
    """Tell whether the stored value at ``attrIndex`` is missing.

    :param attrIndex: position in ``m_AttValues``
    :return: the result of ``Utils.isMissingValue`` for that value
    """
    storedValue = self.m_AttValues[attrIndex]
    return Utils.isMissingValue(storedValue)