Пример #1
0
    def attributeStats(self, index: int) -> AttributeStats:
        """Collect summary statistics for the attribute at ``index``.

        The returned ``AttributeStats`` carries the total instance count, the
        number of missing values, and one ``addDistinct`` entry per distinct
        non-missing value.  Count/weight buffers are pre-sized for nominal
        attributes and a ``Stats`` object is attached for numeric ones.

        Each occurrence adds 1.0 to both the count and the weight of its
        value; instance weights are not consulted.
        """
        attr = self.attribute(index)
        stats = AttributeStats()
        if attr.isNominal():
            stats.nominalCounts = [0] * attr.numValues()
            stats.nominalWeights = [0] * attr.numValues()
        if attr.isNumeric():
            stats.numericStats = Stats()
        stats.totalCount = self.numInstances()

        # value -> [occurrence count, accumulated weight], in first-seen order
        tallies = {}
        for inst in self.m_Instances:
            value = inst.value(index)
            if Utils.isMissingValue(value):
                stats.missingCount += 1
                continue
            tally = tallies.setdefault(value, [0.0, 0.0])
            tally[0] += 1.0
            tally[1] += 1.0

        for value, (count, weight) in tallies.items():
            stats.addDistinct(value, count, weight)
        return stats
Пример #2
0
 def buildClassifier(self, instances: Instances):
     """Fit the model from the class column of ``instances``.

     The data is first checked with ``getCapabilities().testWithFail``.

     * Nominal class: ``m_Counts`` starts at 1 for every class, each
       instance adds its weight to the slot of its class, ``m_ClassValue``
       becomes ``Utils.maxIndex(m_Counts)`` and the counts are handed to
       ``Utils.normalize`` together with the weight total.
     * Numeric class: ``m_Counts`` is ``None`` and ``m_ClassValue`` is the
       weight-scaled sum of class values, divided by the weight total when
       ``Utils.gr(total, 0)`` holds.

     Instances whose class value is missing are skipped.
     """
     self.getCapabilities().testWithFail(instances)
     sumOfWeights = 0
     self.m_Class = instances.classAttribute()
     self.m_ClassValue = 0
     attrType = instances.classAttribute().type()
     if attrType == Attribute.NUMERIC:
         self.m_Counts = None
     elif attrType == Attribute.NOMINAL:
         # Every class starts with a count of 1; those initial counts are
         # included in the weight total as well.
         self.m_Counts = [1] * instances.numClasses()
         sumOfWeights = instances.numClasses()
     for instance in instances:
         classValue = instance.classValue()
         if not Utils.isMissingValue(classValue):
             if instances.classAttribute().isNominal():
                 # The class value may arrive as a float; list subscripts
                 # must be integers, so convert before indexing.
                 self.m_Counts[int(classValue)] += instance.weight()
             else:
                 self.m_ClassValue += instance.weight() * classValue
             sumOfWeights += instance.weight()
     if instances.classAttribute().isNumeric():
         if Utils.gr(sumOfWeights, 0):
             self.m_ClassValue /= sumOfWeights
     else:
         self.m_ClassValue = Utils.maxIndex(self.m_Counts)
         Utils.normalize(self.m_Counts, sumOfWeights)
Пример #3
0
 def difference(self, index: int, val1: float, val2: float):
     """Return the per-attribute difference between ``val1`` and ``val2``.

     Nominal attributes yield 1 when either value is missing or the two
     integer codes differ, and 0 otherwise.

     Numeric attributes:
       * both values present: ``val1 - val2``, computed on ``norm``-ed
         values unless ``m_DontNormalize`` is set;
       * both missing: 1 when normalizing, else the attribute's
         ``R_WIDTH`` range entry;
       * exactly one missing: the larger distance the known value can have,
         i.e. ``max(d, 1 - d)`` on the normalized value, or the larger of
         the distances to ``R_MAX`` and ``R_MIN`` when not normalizing.

     Any other attribute type yields 0.
     """
     attrType = self.m_Data.attribute(index).type()

     if attrType == Attribute.NOMINAL:
         differs = (Utils.isMissingValue(val1) or Utils.isMissingValue(val2)
                    or int(val1) != int(val2))
         return 1 if differs else 0

     if attrType == Attribute.NUMERIC:
         missing1 = Utils.isMissingValue(val1)
         missing2 = Utils.isMissingValue(val2)

         if not (missing1 or missing2):
             if self.m_DontNormalize:
                 return val1 - val2
             return self.norm(val1, index) - self.norm(val2, index)

         if missing1 and missing2:
             if self.m_DontNormalize:
                 return self.m_Ranges[index][self.R_WIDTH]
             return 1

         # Exactly one value is known; measure it against the farther bound.
         known = val1 if missing2 else val2
         if self.m_DontNormalize:
             bounds = self.m_Ranges[index]
             toMax = bounds[self.R_MAX] - known
             toMin = known - bounds[self.R_MIN]
             return toMax if toMax > toMin else toMin
         scaled = self.norm(known, index)
         return 1 - scaled if scaled < 0.5 else scaled

     return 0
Пример #4
0
 def makeDistribution(self,predictedClass:float):
     """Turn a single predicted value into a list of ``m_NumClasses`` entries.

     A missing prediction gives all zeros.  For a nominal class the slot at
     ``int(predictedClass)`` is set to 1; otherwise slot 0 holds the
     predicted value itself.
     """
     dist = [0 for _ in range(self.m_NumClasses)]
     if Utils.isMissingValue(predictedClass):
         return dist
     if not self.m_ClassIsNominal:
         dist[0] = predictedClass
         return dist
     dist[int(predictedClass)] = 1
     return dist
Пример #5
0
 def setSplitPoint(self, allInstances: Instances):
     """Move ``m_splitPoint`` onto an attribute value that occurs in the data.

     Only applies when the split attribute is numeric and there is more
     than one subset.  The new split point is the largest non-missing value
     of the split attribute that does not exceed the current split point,
     or negative infinity when no such value exists.
     """
     splitAttr = allInstances.attribute(self.m_attIndex)
     if not (splitAttr.isNumeric() and self.m_numSubsets > 1):
         return

     best = float("-inf")
     for pos in range(allInstances.numInstances()):
         candidate = allInstances.instance(pos).value(self.m_attIndex)
         if Utils.isMissingValue(candidate):
             continue
         if best < candidate <= self.m_splitPoint:
             best = candidate
     self.m_splitPoint = best
Пример #6
0
 def distributionForInstance(self,instance:Instance)->List[float]:
     """Build a class distribution from ``classifyInstance``.

     Nominal class: the predicted class index gets 1.0 (all zeros if the
     prediction is missing).  Numeric or date class: slot 0 holds the
     prediction.  Any other class type leaves the list at all zeros.
     """
     probs = [0] * instance.numClasses()
     classType = instance.classAttribute().type()
     if classType == Attribute.NOMINAL:
         predicted = self.classifyInstance(instance)
         if not Utils.isMissingValue(predicted):
             probs[int(predicted)] = 1.0
     elif classType == Attribute.NUMERIC or classType == Attribute.DATE:
         probs[0] = self.classifyInstance(instance)
     return probs
Пример #7
0
 def updateStatsForPredictor(self,predictedValue:float,instance:Instance):
     """Fold one numeric prediction into the running evaluation sums.

     An instance without a class value only increases ``m_MissingClass``.
     Otherwise its weight is added to ``m_WithClass``; a missing prediction
     is then booked under ``m_Unclassified`` and nothing else changes.  For
     a usable prediction the weighted sums of the actual value, the
     predicted value, their squares and their product are updated, and
     ``updateNumericScores`` is called with the two distributions.
     """
     if instance.classIsMissing():
         self.m_MissingClass += instance.weight()
         return

     weight = instance.weight()
     self.m_WithClass += weight
     if Utils.isMissingValue(predictedValue):
         self.m_Unclassified += weight
         return

     actual = instance.classValue()
     weightedActual = weight * actual
     weightedPredicted = weight * predictedValue
     self.m_SumClass += weightedActual
     self.m_SumSqrClass += weightedActual * actual
     self.m_SumClassPredicted += weightedActual * predictedValue
     self.m_SumPredicted += weightedPredicted
     self.m_SumSqrPredicted += weightedPredicted * predictedValue
     self.updateNumericScores(
         self.makeDistribution(predictedValue),
         self.makeDistribution(actual),
         weight)
Пример #8
0
    def sort(self, attrIndex=None):
        """Reorder ``m_Instances`` by the values of one attribute.

        ``attrIndex`` may be an attribute index or an ``Attribute`` (its
        ``index()`` is used).  Nominal attributes are delegated to
        ``sortBasedOnNominalAttribute``.  For all other attributes the
        instances are placed in the order returned by
        ``Utils.sortWithNoMissingValues``, with missing values replaced by
        positive infinity as the sort key.
        """
        if isinstance(attrIndex, Attribute):
            attrIndex = attrIndex.index()

        if self.attribute(attrIndex).isNominal():
            self.sortBasedOnNominalAttribute(attrIndex)
            return

        count = self.numInstances()
        snapshot = [self.instance(pos) for pos in range(count)]  #type:List[Instance]
        keys = []
        for inst in snapshot:
            value = inst.value(attrIndex)
            keys.append(float('inf') if Utils.isMissingValue(value) else value)

        order = Utils.sortWithNoMissingValues(keys)
        for pos in range(count):
            self.m_Instances[pos] = snapshot[order[pos]]
Пример #9
0
    def evaluationForSingleInstance(self, a0, instance:Instance, storePredictions:bool):
        """Evaluate one instance and return the prediction.

        ``a0`` is dispatched on its type:

        * a list is treated as the model output for ``instance``.  With a
          nominal class the prediction is ``Utils.maxIndex`` of the list
          (a missing value if that entry is not positive) and
          ``updateStatsForClassifier`` is called; otherwise the prediction
          is the first entry and ``updateStatsForPredictor`` is called.
          When ``storePredictions`` is set and predictions are not
          discarded, a ``NominalPrediction`` / ``NumericPrediction`` is
          appended to ``m_Predictions``.
        * a ``Classifier`` is asked for the distribution of a deep copy of
          the instance whose class has been set to missing; the result is
          evaluated through the list branch.  For a numeric class with a
          known class value and a usable prediction, interval and
          conditional-density statistics are updated when the classifier
          supports them, otherwise the matching availability flag is
          cleared.

        Any other type of ``a0`` yields ``None``.
        """
        if isinstance(a0,List):
            nominal = self.m_ClassIsNominal
            if nominal:
                pred = Utils.maxIndex(a0)
                if a0[int(pred)] <= 0:
                    pred = Utils.missingValue()
                self.updateStatsForClassifier(a0, instance)
            else:
                pred = a0[0]
                self.updateStatsForPredictor(pred, instance)

            if storePredictions and not self.m_DiscardPredictions:
                if self.m_Predictions is None:
                    self.m_Predictions = []
                if nominal:
                    entry = NominalPrediction(instance.classValue(), a0, instance.weight())
                else:
                    entry = NumericPrediction(instance.classValue(), pred, instance.weight())
                self.m_Predictions.append(entry)
            return pred

        if not isinstance(a0,Classifier):
            return None

        # Predict on a copy with the class hidden from the classifier.
        # TODO: special handling for InputMappedClassifier is not ported yet.
        withheld = copy.deepcopy(instance)
        withheld.setDataset(instance.dataset())
        withheld.setClassMissing()

        pred = self.evaluationForSingleInstance(
            a0.distributionForInstance(withheld), instance, storePredictions)
        if (self.m_ClassIsNominal or instance.classIsMissing()
                or Utils.isMissingValue(pred)):
            return pred

        if isinstance(a0,IntervalEstimator):
            self.updateStatsForIntervalEstimator(a0, withheld, instance.classValue())
        else:
            self.m_CoverageStatisticsAvailable = False
        if isinstance(a0,ConditionalDensityEstimator):
            self.updateStatsForConditionalDensityEstimator(a0, withheld, instance.classValue())
        else:
            self.m_ComplexityStatisticsAvailable = False
        return pred
Пример #10
0
    def toClassDetailsString(self,title:str="=== Detailed Accuracy By Class ===\n"):
        """Render the per-class accuracy table as text.

        The table has one row per class followed by a "Weighted Avg." row.
        Only the metrics named in ``m_metricsToDisplay`` get a column, always
        in the fixed order TP rate, FP rate, precision, recall, F-measure,
        MCC, ROC area, PRC area.  Each cell is 12 characters wide: the value
        with three decimals, or ``?`` when the value is missing.

        :param title: text placed before the table header.
        :return: the formatted table.
        :raises Exception: if the class attribute is not nominal.
        """
        if not self.m_ClassIsNominal:
            raise Exception("Evaluation: No per class statistics possible!")

        # (metric key, column header, per-class method, weighted-average method)
        # Methods are named rather than bound so that only the displayed
        # metrics are ever looked up and called.
        columns = (
            ("tp rate", "TP Rate     ", "truePositiveRate", "weightedTruePositiveRate"),
            ("fp rate", "FP Rate     ", "falsePositiveRate", "weightedFalsePositiveRate"),
            ("precision", "Precision   ", "precision", "weightedPrecision"),
            ("recall", "Recall      ", "recall", "weightedRecall"),
            ("f-measure", "F-Measure   ", "fMeasure", "weightedFMeasure"),
            ("mcc", "MCC         ", "matthewsCorrelationCoefficient", "weightedMatthewsCorrelation"),
            ("roc area", "ROC Area    ", "areaUnderROC", "weightedAreaUnderROC"),
            ("prc area", "PRC Area    ", "areaUnderPRC", "weightedAreaUnderPRC"),
        )
        shown = [column for column in columns
                 if column[0] in self.m_metricsToDisplay]

        def cell(value):
            # One fixed-width table cell; missing values are shown as "?".
            if Utils.isMissingValue(value):
                return "?           "
            return "{:<12.3f}".format(value)

        parts = [title, "\n                 "]
        parts.extend(header for _, header, _, _ in shown)
        parts.append("Class\n")

        for i in range(self.m_NumClasses):
            parts.append("                 ")
            for _, _, perClass, _ in shown:
                parts.append(cell(getattr(self, perClass)(i)))
            parts.append(self.m_ClassNames[i] + "\n")

        parts.append("Weighted Avg.    ")
        for _, _, _, weighted in shown:
            parts.append(cell(getattr(self, weighted)()))
        parts.append("\n")
        return "".join(parts)
    def run(self):
        """Compute the histogram of the panel's current attribute and repaint.

        The panel mutex is held for the whole computation and is always
        released, even when the computation raises.

        The bar width is ``3.49 * stdDev * n ** (-1/3)``; the number of bars
        is the value range divided by that width (at least 1).  If that
        exceeds the painter width, it becomes the painter width minus 6
        (again at least 1).  A missing or non-positive bar width (for
        example when all values are equal) yields a single bar.

        With a nominal class attribute selected, each bar is split per class
        value (slot 0 collects instances whose class is missing) and stored
        as ``SparseInstance`` objects in ``m_histBarClassCounts``; otherwise
        plain weighted counts go to ``m_histBarCounts``.  ``m_maxValue``
        receives the largest bar total and ``m_barRange`` the value range
        covered by one bar.  Instances whose attribute value is missing are
        ignored.
        """
        panel = self.m_panel
        panel.mutex.lock()
        try:
            data = panel.m_data
            stats = panel.m_as.numericStats
            attrIndex = panel.m_attrIndex
            classIndex = panel.m_classIndex
            valueSpan = stats.max - stats.min

            intervalWidth = 3.49 * stats.stdDev * math.pow(
                data.numInstances(), -1 / 3)
            # A missing or zero width cannot be divided by; use one bar.
            if Utils.isMissingValue(intervalWidth) or intervalWidth <= 0:
                intervals = 1
            else:
                intervals = max(1, int(round(valueSpan / intervalWidth)))
            if intervals > panel.m_Painter.width():
                # leave a few pixels of padding
                intervals = max(1, panel.m_Painter.width() - 6)
            barRange = valueSpan / intervals

            def barIndex(value):
                # Bar that ``value`` falls into.  With a zero-width range
                # every value equals the minimum and belongs to bar 0.
                if barRange == 0:
                    return 0
                t = int(math.ceil((value - stats.min) / barRange))
                return 0 if t == 0 else t - 1

            if classIndex >= 0 and data.attribute(classIndex).isNominal():
                numClassValues = data.attribute(classIndex).numValues()
                # Slot 0 of every bar is reserved for a missing class value.
                histClassCounts = [[0] * (numClassValues + 1)
                                   for _ in range(intervals)]
                Utils.debugOut("max", stats.max)
                Utils.debugOut("min", stats.min)
                Utils.debugOut("intervalWidth", intervalWidth)
                Utils.debugOut("len", len(histClassCounts))
                Utils.debugOut("histClasCount:", histClassCounts)

                panel.m_maxValue = 0
                if len(panel.m_colorList) == 0:
                    panel.m_colorList.append("black")
                # Make sure there is one colour per class slot.
                for i in range(len(panel.m_colorList), numClassValues + 1):
                    panel.m_colorList.append(
                        AttributeVisualizationPanel.m_colorNames[(i - 1) % 10])

                for k in range(data.numInstances()):
                    inst = data.instance(k)
                    if inst.isMissing(attrIndex):
                        continue
                    bar = barIndex(inst.value(attrIndex))
                    if inst.isMissing(classIndex):
                        slot = 0
                    else:
                        slot = int(inst.value(classIndex) + 1)
                    histClassCounts[bar][slot] += inst.weight()

                for counts in histClassCounts:
                    total = sum(counts)
                    if panel.m_maxValue < total:
                        panel.m_maxValue = total

                # Keep only the non-empty class slots of every bar.
                histClassCountsSparse = []
                for counts in histClassCounts:
                    sparseIndices = [j for j, c in enumerate(counts) if c > 0]
                    sparseValues = [counts[j] for j in sparseIndices]
                    histClassCountsSparse.append(
                        SparseInstance(1.0, sparseValues, sparseIndices,
                                       len(counts)))

                panel.m_histBarClassCounts = histClassCountsSparse
                panel.m_barRange = barRange
            else:
                histCounts = [0] * intervals
                panel.m_maxValue = 0

                for k in range(data.numInstances()):
                    inst = data.instance(k)
                    if inst.isMissing(attrIndex):
                        continue
                    bar = barIndex(inst.value(attrIndex))
                    histCounts[bar] += inst.weight()
                    if histCounts[bar] > panel.m_maxValue:
                        panel.m_maxValue = histCounts[bar]

                panel.m_histBarCounts = histCounts
                panel.m_barRange = barRange

            panel.m_threadRun = False
            panel.m_displayCurrentAttribute = True
            panel.m_doneCurrentAttribute = True
            panel.paint()
        finally:
            panel.mutex.unlock()
Пример #12
0
    def process(self, toPredict: Instance, classifier: Classifier,
                evaluation: Evaluation):
        """Evaluate one instance and, if enabled, record a plot point for it.

        For a nominal class the classifier's distribution is obtained for a
        deep copy of ``toPredict`` and passed to
        ``evaluation.evaluationForSingleInstance``; for any other class type
        ``evaluation.evaluateModelOnceAndRecordPrediction`` is used.

        When ``m_SaveForVisualization`` is set and ``m_PlotInstances`` exists,
        a new ``Instance`` is added to it whose values are those of
        ``toPredict`` with extra columns inserted at the class index
        (margin, prediction, actual for a nominal class; prediction, actual
        otherwise).  A shape and a size for the point are appended to
        ``m_PlotShapes`` and ``m_PlotSizes``.
        """
        probActual = probNext = pred = 0
        classMissing = copy.deepcopy(toPredict)
        classMissing.setDataset(toPredict.dataset())

        if toPredict.classAttribute().isNominal():
            # probability distribution predicted by the classifier
            preds = classifier.distributionForInstance(classMissing)
            # if all probabilities are 0 the instance belongs to no class
            val = 0
            if sum(preds) == 0:
                pred = Utils.missingValue()
                probActual = Utils.missingValue()
            else:
                # the predicted class is the index with the largest probability
                pred = Utils.maxIndex(preds)
                # NOTE(review): this tests classIndex() (an index), not the
                # class value - presumably never "missing", which would make
                # the else branch below unreachable; confirm.
                if not Utils.isMissingValue(toPredict.classIndex()):
                    # a known class value is used as the actual class index;
                    # otherwise val stays 0
                    if not Utils.isMissingValue(toPredict.classValue()):
                        val = int(toPredict.classValue())
                    probActual = preds[val]
                else:
                    probActual = preds[Utils.maxIndex(preds)]
            # probNext: highest probability among the classes other than val
            for i in range(toPredict.classAttribute().numValues()):
                if i != val and preds[i] > probNext:
                    probNext = preds[i]
            evaluation.evaluationForSingleInstance(preds, toPredict, True)
        else:
            # evaluate this single instance
            pred = evaluation.evaluateModelOnceAndRecordPrediction(
                classifier, toPredict)
        if not self.m_SaveForVisualization:
            return
        # store the data used for visualization
        if self.m_PlotInstances is not None:
            isNominal = toPredict.classAttribute().isNominal()
            values = [0] * self.m_PlotInstances.numAttributes()
            i = 0
            while i < self.m_PlotInstances.numAttributes():
                # attributes before the class are copied unchanged
                if i < toPredict.classIndex():
                    values[i] = toPredict.value(i)
                elif i == toPredict.classIndex():
                    if isNominal:
                        # margin: probability of the actual class minus the
                        # best alternative
                        values[i] = probActual - probNext
                        # predicted value
                        values[i + 1] = pred
                        # original (actual) value
                        values[i + 2] = toPredict.value(i)
                        # skip the two extra columns just written
                        i += 2
                    else:
                        values[i] = pred
                        values[i + 1] = toPredict.value(i)
                        # skip the one extra column just written
                        i += 1
                else:
                    # attributes after the class are shifted by the number of
                    # inserted columns
                    if isNominal:
                        values[i] = toPredict.value(i - 2)
                    else:
                        values[i] = toPredict.value(i - 1)
                i += 1
            self.m_PlotInstances.add(Instance(1.0, values))
            if toPredict.classAttribute().isNominal():
                if toPredict.isMissing(
                        toPredict.classIndex()) or Utils.isMissingValue(pred):
                    self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
                elif pred != toPredict.classValue():
                    self.m_PlotShapes.append(Plot2D.ERROR_SHAPE)
                else:
                    self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)
                if self.m_pointSizeProportionalToMargin:
                    self.m_PlotSizes.append(probActual - probNext)
                else:
                    # points whose prediction differs from the class value
                    # are drawn one size larger
                    sizeAdj = 0
                    if pred != toPredict.classValue():
                        sizeAdj = 1
                    self.m_PlotSizes.append(Plot2D.DEFAULT_SHAPE_SIZE.value +
                                            sizeAdj)
            else:
                # numeric class: the size is the prediction error, or None
                # when either the class value or the prediction is missing
                errd = None
                if not toPredict.isMissing(toPredict.classIndex(
                )) and not Utils.isMissingValue(pred):
                    errd = pred - toPredict.classValue()
                    self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)
                else:
                    self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
                self.m_PlotSizes.append(errd)
Пример #13
0
 def isMissingSparse(self, index: int):
     """Return ``True`` when the value at sparse position ``index`` is missing, else ``False``."""
     stored = self.valueSparse(index)
     return True if Utils.isMissingValue(stored) else False
Пример #14
0
 def isMissing(self, attrIndex: int):
     """Report whether the stored value of attribute ``attrIndex`` is missing.

     The result is whatever ``Utils.isMissingValue`` returns for that value.
     """
     stored = self.m_AttValues[attrIndex]
     return Utils.isMissingValue(stored)