示例#1
0
    def __call__(self, examples):
        """Fit one single-attribute logistic regression per attribute plus
        an intercept-only (majority) model, and bundle their statistics
        into a Univariate_LogRegClassifier.
        """
        examples = createFullNoDiscTable(examples)
        class_var = examples.domain.classVar

        # One logistic model per attribute, trained on the projection of
        # the data onto (attribute, class), with missing rows dropped.
        univariate = []
        for attribute in examples.domain.attributes:
            projection = examples.select(orange.Domain(attribute, class_var))
            cleaned = orange.Preprocessor_dropMissing(projection)
            univariate.append(LogRegLearner(cleaned))

        # Intercept-only model: domain contains just the class variable.
        majority = LogRegLearner(
            orange.Preprocessor_dropMissing(
                examples.select(orange.Domain(class_var))))

        def _collect(stat_name):
            # Slot 0 of the majority model is the intercept statistic;
            # slot 1 of each univariate model belongs to its attribute.
            head = getattr(majority, stat_name)[0]
            return [head] + [getattr(c, stat_name)[1] for c in univariate]

        return Univariate_LogRegClassifier(beta=_collect("beta"),
                                           beta_se=_collect("beta_se"),
                                           P=_collect("P"),
                                           wald_Z=_collect("wald_Z"),
                                           domain=examples.domain)
示例#2
0
    def __call__(self, examples, weight=0):
        """Build a logistic regression classifier on `examples`.

        Optionally imputes and/or drops missing values, optionally runs
        stepwise feature subset selection first, and — when
        ``self.removeSingular`` is set — keeps removing attributes the
        fitter flags as singular until a real model is fitted.

        Returns the fitted classifier, or None when `examples` is empty.
        """
        # NOTE(review): the trailing "or None" is redundant — getattr
        # already defaults to None here.
        imputer = getattr(self, "imputer", None) or None
        if getattr(self, "removeMissing", 0):
            examples = orange.Preprocessor_dropMissing(examples)


##        if hasDiscreteValues(examples.domain):
##            examples = createNoDiscTable(examples)
        if not len(examples):
            return None
        if getattr(self, "stepwiseLR", 0):
            # Stepwise attribute selection, then restrict the data to the
            # selected attributes (meta attributes are carried over).
            addCrit = getattr(self, "addCrit", 0.2)
            removeCrit = getattr(self, "removeCrit", 0.3)
            numAttr = getattr(self, "numAttr", -1)
            attributes = StepWiseFSS(examples,
                                     addCrit=addCrit,
                                     deleteCrit=removeCrit,
                                     imputer=imputer,
                                     numAttr=numAttr)
            tmpDomain = orange.Domain(attributes, examples.domain.classVar)
            tmpDomain.addmetas(examples.domain.getmetas())
            examples = examples.select(tmpDomain)
        learner = orange.LogRegLearner()
        learner.imputerConstructor = imputer
        if imputer:
            # Impute up front as well (self.imputer is the same object as
            # `imputer` at this point), so the fitter below sees no
            # missing values.
            examples = self.imputer(examples)(examples)
        examples = orange.Preprocessor_dropMissing(examples)
        if self.fitter:
            learner.fitter = self.fitter
        if self.removeSingular:
            lr = learner.fitModel(examples, weight)
        else:
            lr = learner(examples, weight)
        # fitModel returns a Variable (rather than a classifier) when the
        # fit failed on a singular attribute: remove that attribute from
        # the domain and refit until a proper model comes back.
        while isinstance(lr, orange.Variable):
            if isinstance(lr.getValueFrom,
                          orange.ClassifierFromVar) and isinstance(
                              lr.getValueFrom.transformer,
                              orange.Discrete2Continuous):
                # The offending variable is a continuized indicator;
                # map it back to the original discrete attribute.
                lr = lr.getValueFrom.variable
            attributes = examples.domain.attributes[:]
            if lr in attributes:
                attributes.remove(lr)
            else:
                attributes.remove(lr.getValueFrom.variable)
            newDomain = orange.Domain(attributes, examples.domain.classVar)
            newDomain.addmetas(examples.domain.getmetas())
            examples = examples.select(newDomain)
            lr = learner.fitModel(examples, weight)
        return lr
示例#3
0
 def getImputer(self):
     """Return the missing-value preprocessor selected by ``self.methodInd``.

     Indices 0-2 pick a learner-based imputer from ``self.IMPUTERS``;
     index 3 drops examples with missing values.

     Raises:
         ValueError: for any other index.  (The original code fell
         through without assigning ``imputer`` and raised an opaque
         UnboundLocalError instead.)
     """
     if self.methodInd in (0, 1, 2):
         learner = self.IMPUTERS[self.methodInd][1]()
         return Preprocessor_imputeByLearner(learner=learner)
     if self.methodInd == 3:
         return orange.Preprocessor_dropMissing()
     raise ValueError("unsupported imputation method index: %r"
                      % (self.methodInd,))
示例#4
0
    def test_dropMissingValues(self):
        """dropMissing keeps only fully specified examples and records the
        original table as the result's base table.
        """
        # Two of the six appended rows contain an unknown ("?") value.
        # (Assumes self.data starts empty — TODO confirm in setUp.)
        rows = [[0, 0, 0], [0, 0, "?"], [0, 1, 0],
                [1, 0, 1], [0, "?", 0], [1, 0, 1]]
        self.data += rows

        cleaned = orange.Preprocessor_dropMissing(self.data)
        self.assertEqual(len(cleaned), 4)
        self.assertIs(self.data, cleaned.base)
示例#5
0
    def MeasureAttribute_info(self, attr, data):
        """Score `attr` by the Pearson correlation between its values and
        the class values, ignoring examples with missing entries.
        """
        projected = orange.Preprocessor_dropMissing(
            data.select([attr, data.domain.classVar]))

        attr_values = [example[0].value for example in projected]
        class_values = [example[1].value for example in projected]

        correlation, _prob = statc.pearsonr(attr_values, class_values)
        return correlation
示例#6
0
def computeCorrelation(data, attr1, attr2):
    """Return the Pearson correlation between two continuous attributes.

    Examples with missing values in either attribute are dropped first.
    Returns None when either attribute is not continuous, and 0.0 when
    the correlation cannot be computed (e.g. a constant column).
    """
    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None

    table = data.select([attr1, attr2])
    table = orange.Preprocessor_dropMissing(table)
    a1 = [table[k][attr1].value for k in range(len(table))]
    a2 = [table[k][attr2].value for k in range(len(table))]

    try:
        val, prob = statc.pearsonr(a1, a2)
    except Exception:
        # Was a bare "except:" — keep the best-effort fallback but stop
        # swallowing KeyboardInterrupt/SystemExit as well.
        val = 0.0  # possibly invalid a1 or a2

    return val
示例#7
0
def computeCorrelationInsideClasses(data, attr1, attr2):
    """Length-weighted mean of per-class |Pearson correlation| between
    two continuous attributes.

    Examples with missing values are dropped first; classes with no
    complete examples are skipped.

    Returns (weighted_corr, per_class_corrs, per_class_lengths), or None
    when either attribute is not continuous or when no class subset has
    any complete examples (the original raised ZeroDivisionError there).
    """
    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None

    table = data.select([attr1, attr2, data.domain.classVar])
    table = orange.Preprocessor_dropMissing(table)
    lengths = []
    corrs = []
    # FIX: the original reused `val` both as the class value being
    # iterated and as the pearsonr result — confusing shadowing.
    for class_value in table.domain.classVar.values:
        subset = table.filter({table.domain.classVar: class_value})
        a1 = [subset[k][attr1].value for k in range(len(subset))]
        a2 = [subset[k][attr2].value for k in range(len(subset))]
        if len(a1) == 0: continue
        corr_val, prob = statc.pearsonr(a1, a2)
        lengths.append(len(a1))
        corrs.append(corr_val)
    if not lengths:
        return None  # guard: avoid ZeroDivisionError on sum(lengths)
    corr = 0
    for ind in range(len(corrs)):
        corr += abs(corrs[ind]) * lengths[ind]
    corr /= float(sum(lengths))
    return corr, corrs, lengths
示例#8
0
    def __call__(self, examples):
        """Stepwise forward/backward attribute selection for logistic
        regression.

        Alternates a backward step (drop the already-selected attribute
        whose removal hurts the log-likelihood least, if its P-value
        exceeds ``self.deleteCrit``) and a forward step (add the
        remaining attribute that improves the log-likelihood most, if
        its P-value is below ``self.addCrit``).  Usefulness is judged
        with a likelihood-ratio G statistic against a chi-square
        distribution.  Stops when neither step helps, or once
        ``self.numAttr`` attributes are selected (if numAttr > -1).

        Returns the list of selected attributes.
        """
        if getattr(self, "imputer", 0):
            examples = self.imputer(examples)(examples)
        if getattr(self, "removeMissing", 0):
            examples = orange.Preprocessor_dropMissing(examples)
        # Continuizer that turns discrete attributes into 0-based
        # indicator columns (class left untouched) before each fit.
        continuizer = orange.DomainContinuizer(
            zeroBased=1,
            continuousTreatment=orange.DomainContinuizer.Leave,
            multinomialTreatment=orange.DomainContinuizer.FrequentIsBase,
            classTreatment=orange.DomainContinuizer.Ignore)
        attr = []  # attributes selected so far
        remain_attr = examples.domain.attributes[:]  # remaining candidates

        # Baseline: log-likelihood of the intercept-only (majority) model.
        tempDomain = orange.Domain(attr, examples.domain.classVar)
        tempData = orange.Preprocessor_dropMissing(examples.select(tempDomain))

        ll_Old = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
        ll_Best = -1000000
        length_Old = float(len(tempData))

        stop = 0
        while not stop:
            # Loop until all attributes are added or neither deletion nor
            # addition is possible any more.
            worstAt = None
            # Backward elimination only makes sense with >= 2 attributes.
            if len(attr) >= 2:
                minG = 1000
                worstAt = attr[0]
                ll_Best = ll_Old
                length_Best = length_Old
                for at in attr:
                    # Refit without `at` to see how much its presence
                    # contributes to the log-likelihood.

                    tempAttr = filter(lambda x: x != at, attr)
                    tempDomain = orange.Domain(tempAttr,
                                               examples.domain.classVar)
                    tempDomain.addmetas(examples.domain.getmetas())
                    # Continuize the reduced domain, then select the data
                    # through it and drop rows with missing values.
                    # NOTE(review): selecting from `examples` twice (once
                    # pre-, once post-continuization) looks redundant —
                    # confirm the first select/drop is needed.
                    tempDomain = continuizer(
                        orange.Preprocessor_dropMissing(
                            examples.select(tempDomain)))
                    tempData = orange.Preprocessor_dropMissing(
                        examples.select(tempDomain))

                    ll_Delete = getLikelihood(orange.LogRegFitter_Cholesky(),
                                              tempData)
                    length_Delete = float(len(tempData))
                    length_Avg = (length_Delete + length_Old) / 2.0

                    # Likelihood-ratio statistic, normalized by the (row
                    # counts may differ) average number of examples.
                    G = -2 * length_Avg * (ll_Delete / length_Delete -
                                           ll_Old / length_Old)

                    # Track the attribute whose removal costs the least.
                    if G < minG:
                        worstAt = at
                        minG = G
                        ll_Best = ll_Delete
                        length_Best = length_Delete
                # Decide on deletion of the worst attribute.

                # Degrees of freedom: 1 for continuous, (values - 1) for
                # discrete attributes.
                if worstAt.varType == orange.VarTypes.Continuous:
                    P = lchisqprob(minG, 1)
                else:
                    P = lchisqprob(minG,
                                   len(worstAt.values) - 1)
                if P >= self.deleteCrit:
                    # Not significant enough to keep: move it back to the
                    # candidate pool and accept the reduced model.
                    attr.remove(worstAt)
                    remain_attr.append(worstAt)
                    nodeletion = 0
                    ll_Old = ll_Best
                    length_Old = length_Best
                else:
                    nodeletion = 1
            else:
                nodeletion = 1
                # END OF DELETION PART

            # If enough attributes have been chosen, stop adding more.
            if self.numAttr > -1 and len(attr) >= self.numAttr:
                remain_attr = []

            # Forward step: try adding each remaining attribute.
            maxG = -1
            ll_Best = ll_Old
            length_Best = length_Old
            bestAt = None
            for at in remain_attr:
                tempAttr = attr + [at]
                tempDomain = orange.Domain(tempAttr, examples.domain.classVar)
                tempDomain.addmetas(examples.domain.getmetas())
                # Same continuize-then-select dance as in the backward step.
                tempDomain = continuizer(
                    orange.Preprocessor_dropMissing(
                        examples.select(tempDomain)))
                tempData = orange.Preprocessor_dropMissing(
                    examples.select(tempDomain))
                ll_New = getLikelihood(orange.LogRegFitter_Cholesky(),
                                       tempData)

                length_New = float(
                    len(tempData)
                )  # number of examples in tempData, to normalize likelihood

                # P = Pr(CHI^2 > G), G = -2(L(0)-L(1)) = 2(E(0)-E(1))
                length_avg = (length_New + length_Old) / 2
                G = -2 * length_avg * (ll_Old / length_Old -
                                       ll_New / length_New)
                if G > maxG:
                    bestAt = at
                    maxG = G
                    ll_Best = ll_New
                    length_Best = length_New
            if not bestAt:
                # Nothing left to add: terminate.
                stop = 1
                continue

            if bestAt.varType == orange.VarTypes.Continuous:
                P = lchisqprob(maxG, 1)
            else:
                P = lchisqprob(maxG,
                               len(bestAt.values) - 1)
            # Add the attribute with the smallest P to the selection.
            if P <= self.addCrit:
                attr.append(bestAt)
                remain_attr.remove(bestAt)
                ll_Old = ll_Best
                length_Old = length_Best

            # Stop when addition fails and deletion also failed, or when
            # we would just re-add the attribute we deleted.
            if (P > self.addCrit and nodeletion) or (bestAt == worstAt):
                stop = 1

        return attr
示例#9
0
# Demo of the missing-value preprocessors.  Assumes `data`, `pp`, `age`
# and `astigm` are defined earlier in the script; judging by the first
# message, `pp` presumably removes 50% of class values — TODO confirm.
data2 = pp(data)

print "Removing 50% of class values:",
for ex in data2:
    print ex.getclass(),
print

# Drop the examples whose class value is now unknown.
data2 = orange.Preprocessor_dropMissingClasses(data2)
print "Removing examples with unknown class values:",
for ex in data2:
    print ex.getclass(),
print

# Knock out attribute values instead: 20% of 'age', 50% of astigmatism,
# marking them as "don't care" (DC) special values.
print "\n\nRemoving 20% of values of 'age' and 50% of astigmatism:"
pp = orange.Preprocessor_addMissing()
pp.proportions = {age: 0.2, astigm: 0.5}
pp.specialType = orange.ValueTypes.DC
data2 = pp(data)
for ex in data2:
    print ex

# dropMissing keeps only fully specified examples ...
print "\n\nRemoving examples with unknown values"
data3 = orange.Preprocessor_dropMissing(data2)
for ex in data3:
    print ex

# ... while takeMissing keeps only examples with at least one unknown.
print "\n\nSelecting examples with unknown values"
data3 = orange.Preprocessor_takeMissing(data2)
for ex in data3:
    print ex
示例#10
0
def getFunctionalList(data):
    """Greedily order discrete attributes by how well they merge with the
    (repeatedly re-merged) class via orngCI.FeatureByMinComplexity.

    Returns the attribute names in merge order (with the trailing "'"
    that merging appends stripped off), or [] when no usable discrete
    attribute exists.
    """
    import orngCI

    bestQual = -10000000
    bestAttr = -1
    testAttrs = []

    dataShort = orange.Preprocessor_dropMissing(data)
    # Keep only discrete attributes that have more than one value.
    disc = []
    for i in range(len(dataShort.domain.attributes)):
        attribute = dataShort.domain.attributes[i]
        if attribute.varType == orange.VarTypes.Discrete and len(
                attribute.values) > 1:
            disc.append(attribute.name)
    if disc == []: return []
    discData = dataShort.select(disc + [dataShort.domain.classVar.name])

    remover = orngCI.AttributeRedundanciesRemover(noMinimization=1)
    newData = remover(discData, weight=0)

    for attr in newData.domain.attributes:
        testAttrs.append(attr.name)

    # Find the single attribute that merges best with the class.
    for i in range(len(newData.domain.attributes)):
        vals, qual = orngCI.FeatureByMinComplexity(
            newData, [newData.domain.attributes[i], newData.domain.classVar])
        if qual > bestQual:
            bestQual = qual
            bestAttr = newData.domain.attributes[i].name
            mergedVals = vals
            mergedVals.name = newData.domain.classVar.name

    if bestAttr == -1: return []
    outList = [bestAttr]
    newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals,
                                newData)
    testAttrs.remove(bestAttr)

    # Repeatedly merge the next-best attribute into the running merge.
    while (testAttrs != []):
        bestQual = -10000000
        for attrName in testAttrs:
            vals, qual = orngCI.FeatureByMinComplexity(newData,
                                                       [mergedVals, attrName])
            if qual > bestQual:
                # BUG FIX: the original assigned to `bestqual` (lowercase
                # typo), so bestQual never updated and the *last*
                # candidate won instead of the best one.
                bestQual = qual
                bestAttr = attrName

        vals, qual = orngCI.FeatureByMinComplexity(newData,
                                                   [mergedVals, bestAttr])
        mergedVals = vals
        mergedVals.name = newData.domain.classVar.name
        newData = replaceAttributes(bestAttr, newData.domain.classVar,
                                    mergedVals, newData)
        outList.append(bestAttr)
        testAttrs.remove(bestAttr)

    # Merged attributes get "'" appended to their names; strip it so the
    # names can be matched against the original domain.
    for index in range(len(outList)):
        if outList[index][-1] == "'": outList[index] = outList[index][:-1]
    return outList