Exemplo n.º 1
0
    def test_continuizer_iris(self):
        """Check DomainContinuizer's continuous treatments on the iris data.

        With ``Leave`` the continuized domain must keep the original
        variables.  With ``NormalizeBySpan`` and ``NormalizeByVariance`` the
        continuizer must refuse a bare domain (it is expected to raise
        ``ValueError``) and, given the data, must produce values matching the
        normalization computed by hand from ``DomainBasicAttrStat``.
        """
        iris = orange.ExampleTable("iris")
        continuizer = orange.DomainContinuizer()
        continuizer.class_treatment = \
            continuizer.ClassTreatment.LeaveUnlessTarget

        continuizer.continuous_treatment = \
            continuizer.ContinuousTreatment.Leave
        unchanged = continuizer(iris.domain)
        self.assertEqual(unchanged.variables, iris.domain.variables)

        def normalized_with(treatment):
            # Switch the treatment, check that a domain alone is rejected,
            # then return the statistics of the original data together with
            # (original, converted) pairs for the first ten examples.
            continuizer.continuous_treatment = treatment
            self.assertRaises(ValueError, continuizer, iris.domain)
            converted = orange.ExampleTable(continuizer(iris), iris)
            stats = orange.DomainBasicAttrStat(iris)
            return stats, zip(iris[:10], converted)

        stats, pairs = normalized_with(
            continuizer.ContinuousTreatment.NormalizeBySpan)
        for original, scaled in pairs:
            for i in range(4):
                span = stats[i].max - stats[i].min
                self.assertEqual((original[i] - stats[i].min) / span,
                                 scaled[i])

        stats, pairs = normalized_with(
            continuizer.ContinuousTreatment.NormalizeByVariance)
        for original, scaled in pairs:
            for i in range(4):
                self.assertEqual((original[i] - stats[i].avg) / stats[i].dev,
                                 scaled[i])
Exemplo n.º 2
0
    def __call__(self, attr, data):
        """Return the cached signal-to-noise style score of ``attr``.

        The scores are computed lazily for every attribute of ``data`` that
        has basic statistics: for each pair of class values the absolute
        difference of the per-class averages is divided by the sum of the
        per-class deviations, and the quotients are summed over all pairs.
        Returns -1 for an attribute without a stored score.
        """
        # A different data set invalidates everything cached so far.
        if data != self.data:
            self.attrInfo = {}
            self.data = data

        if self.attrInfo == {}:
            classVar = data.domain.classVar
            # One statistics object per class value, computed on the subset
            # of examples belonging to that class.
            subsets = [
                data.select({classVar.name: [value]})
                for value in classVar.values
            ]
            perClass = [orange.DomainBasicAttrStat(subset)
                        for subset in subsets]
            classCount = len(perClass)
            for index in range(len(perClass[0])):
                # Attributes without statistics in the first class are skipped.
                if perClass[0][index] == None:
                    continue
                column = [stat[index] for stat in perClass]
                score = 0.0
                for j in range(classCount):
                    for k in range(j + 1, classCount):
                        spread = column[j].dev + column[k].dev
                        if spread > 0:
                            score += abs(
                                (column[j].avg - column[k].avg) / spread)
                self.attrInfo[data.domain.attributes[index].name] = score

        return self.attrInfo.get(data.domain[attr].name, -1)
Exemplo n.º 3
0
    def test_pickle(self):
        """Exercise DomainBasicAttrStat on iris: pickling, lookup and purge.

        The statistics object is round-tripped through pickle (the copy is
        not inspected further), then the first attribute's statistics are
        compared with values computed directly from the data, the different
        ways of indexing are checked to return the same object, and
        ``purge`` is checked to shrink the container from 5 to 4 entries.
        """
        import pickle

        iris = orange.ExampleTable("iris")
        sepal_lengths = [float(example[0]) for example in iris]

        stats = orange.DomainBasicAttrStat(iris)
        first = stats[0]
        restored = pickle.loads(pickle.dumps(stats))

        self.assertEqual(first.variable, iris.domain[0])
        self.assertAlmostEqual(first.min, min(sepal_lengths))
        self.assertAlmostEqual(first.max, max(sepal_lengths))
        self.assertAlmostEqual(first.avg,
                               sum(sepal_lengths) / len(sepal_lengths))

        # Indexing by name and by variable must give the very same object.
        self.assertEqual(id(first), id(stats["sepal length"]))
        self.assertEqual(id(first), id(stats[iris.domain[0]]))

        as_list = list(stats)
        self.assertEqual(id(first), id(as_list[0]))
        # The last slot (the class variable) carries no statistics.
        self.assertEqual(as_list[-1], None)
        self.assertTrue(stats.has_class_var)

        for container in (stats, as_list):
            self.assertEqual(len(container), 5)

        stats.purge()
        self.assertEqual(len(stats), 4)
Exemplo n.º 4
0
def cforange_hierarchical_clustering_finished(postdata, input_dict,
                                              output_dict):
    """Build the output tables after the user picked clusters interactively.

    Parameters:
        postdata: mapping of posted form fields; ``widget_id`` and
            ``selected_nodes`` are read, each as a list whose first element
            is used.  ``selected_nodes`` must be a JSON list of
            ``(example_index, selected, cluster)`` triples.
        input_dict: widget inputs; ``dm`` is the distance matrix and
            ``linkage`` the linkage code.
        output_dict: not used by this function.

    Returns:
        A dict with keys ``centroids``, ``selected_examples`` and
        ``unselected_examples``.  All three are ``None`` when the matrix
        items are not an ``orange.ExampleTable``.

    Raises:
        Exception: when ``selected_nodes`` is missing or cannot be parsed.
    """
    import json

    matrix = input_dict['dm']
    linkage = int(input_dict['linkage'])
    # Not used below; the lookup is kept so a request without ``widget_id``
    # still fails the same way as before.
    widget_pk = postdata['widget_id'][0]
    try:
        selected_nodes = json.loads(postdata.get('selected_nodes')[0])
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not turned into a user message.
        raise Exception('Please select a threshold for determining clusters.')

    # Imported only after the request has been validated.
    import orange

    if isinstance(matrix.items, orange.ExampleTable):
        # The resulting tree is not used below; the call is kept unchanged.
        root = Clustering.hierarchical_clustering(linkage, matrix)
        # Sorted so that the order of the cluster values is deterministic.
        cluster_ids = sorted(
            set(cluster for _, _, cluster in selected_nodes))
        selected_clusters = set(
            cluster for _, selected, cluster in selected_nodes if selected)
        clustVar = orange.EnumVariable(
            str('Cluster'),
            values=["Cluster %d" % i for i in cluster_ids] + ["Other"])
        origDomain = matrix.items.domain
        domain = orange.Domain(origDomain.attributes, origDomain.classVar)
        domain.addmeta(orange.newmetaid(), clustVar)
        domain.addmetas(origDomain.getmetas())
        # Build table with selected clusters
        selected_table = orange.ExampleTable(domain)
        unselected_table = orange.ExampleTable(domain)
        for index, selected, cluster in selected_nodes:
            new_ex = orange.Example(domain, matrix.items[index])
            if selected:
                new_ex[clustVar] = clustVar("Cluster %d" % cluster)
                selected_table.append(new_ex)
            else:
                new_ex[clustVar] = clustVar("Other")
                unselected_table.append(new_ex)
        # Build table of centroids
        centroids = orange.ExampleTable(selected_table.domain)
        if len(selected_table) > 0:
            for cluster in sorted(selected_clusters):
                clusterEx = orange.ExampleTable([
                    ex for ex in selected_table
                    if ex[clustVar] == "Cluster %d" % cluster
                ])
                # Attribute statistics: average where basic statistics
                # exist, otherwise the modus of the distribution, otherwise
                # an unknown value.
                contstat = orange.DomainBasicAttrStat(clusterEx)
                discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                ex = [
                    cs.avg if cs else (ds.modus() if ds else "?")
                    for cs, ds in zip(contstat, discstat)
                ]
                example = orange.Example(centroids.domain, ex)
                example[clustVar] = clustVar("Cluster %d" % cluster)
                centroids.append(example)
    else:  # Attribute distance
        centroids, selected_table, unselected_table = None, None, None
    return {
        'centroids': centroids,
        'selected_examples': selected_table,
        'unselected_examples': unselected_table
    }
Exemplo n.º 5
0
    def __call__(self, trainingData=None, weight=None, allowMetas=False):
        """Validate the training data and record what the learner needs.

        Prints an error and returns False when the data is missing, has
        duplicated attribute names, has no class variable, no examples or no
        attributes.  Raises an Exception when attributes that look like
        meta-attributes are detected and ``allowMetas`` is false.  Otherwise
        stores per-attribute basic statistics in ``self.basicStat`` and the
        learner parameters in ``self.parameters``, and returns True.
        """
        self.basicStat = None

        # Guard clauses: every failed check reports and bails out.
        if not trainingData:
            print("AZBaseClasses ERROR: Missing training data!")
            return False
        if dataUtilities.findDuplicatedNames(trainingData.domain):
            print("AZBaseClasses ERROR: Duplicated names found in the training data. Please use the method dataUtilities.DataTable() when loading a dataset in order to fix the duplicated names and avoid this error.")
            return False
        if not trainingData.domain.classVar:
            print("AZBaseClasses ERROR: No class attribute found in training data!")
            return False
        if not len(trainingData):
            print("AZBaseClasses ERROR: No examples in training data!")
            return False
        if not len(trainingData.domain.attributes):
            print("AZBaseClasses ERROR: No attributes in training data!")
            return False

        possibleMetas = dataUtilities.getPossibleMetas(trainingData,
                                                       checkIndividuality=True)
        if not allowMetas and possibleMetas:
            raise Exception(
                "\nAZBaseClasses ERROR: Detected attributes that should be considered meta-attributes:"
                + "".join("\n    " + name for name in possibleMetas))

        # Keep only dev/min/max/avg for each variable; discrete and string
        # variables get None.
        domainStats = orange.DomainBasicAttrStat(trainingData)
        self.basicStat = {}
        for attr in trainingData.domain:
            if attr.varType in [
                    orange.VarTypes.Discrete, orange.VarTypes.String
            ]:
                self.basicStat[attr.name] = None
                continue
            self.basicStat[attr.name] = {
                "dev": domainStats[attr].dev,
                "min": domainStats[attr].min,
                "max": domainStats[attr].max,
                "avg": domainStats[attr].avg
            }

        # The learner name is the last dotted component of the class repr,
        # cut at the final quote.
        classRepr = str(self.__class__)
        learnerName = classRepr[:classRepr.rfind("'")].split(".")[-1]

        # Gather the learner parameters to be stored with the classifier.
        self.parameters = {}
        if learnerName == "ConsensusLearner":
            return True

        # Load AZLearnersParamsConfig.py from AZORANGEHOME.
        configPath = os.path.join(os.environ["AZORANGEHOME"], 'azorange',
                                  "AZLearnersParamsConfig.py")
        AZOLearnersConfig = imp.load_source("AZLearnersParamsConfig",
                                            configPath)
        pars = AZOLearnersConfig.API(learnerName)
        if pars:
            for par in pars.getParameterNames():
                self.parameters[par] = getattr(self, par)
        return True
Exemplo n.º 6
0
def data_center(data):
    """Return the central - average - point in the data set.

    Continuous attributes contribute their average, discrete attributes the
    index of their most frequent value, any other attribute ``None``.  When
    the domain has a class variable, 0 is appended as the class value.
    """
    stats = orange.DomainBasicAttrStat(data)

    def central_value(attribute):
        if attribute.varType == orange.VarTypes.Continuous:
            return stats[attribute].avg
        if attribute.varType == orange.VarTypes.Discrete:
            distribution = orange.Distribution(attribute, data)
            # Position of the largest frequency (the first one on ties).
            best_index, _ = max(enumerate(distribution),
                                key=lambda pair: pair[1])
            return best_index
        return None

    center = [central_value(attribute) for attribute in data.domain.attributes]
    if data.domain.classVar:
        center.append(0)
    return orange.Example(data.domain, center)
Exemplo n.º 7
0
def data_center(data):
    """
    Returns a center of the instances in the data set (average across data
    instances for continuous attributes, the result of ``_modus`` on the
    attribute's distribution for discrete attributes, ``None`` otherwise).
    When the domain has a class variable, 0 is appended as the class value.
    """
    stats = orange.DomainBasicAttrStat(data)

    def central_value(attribute):
        if attribute.varType == orange.VarTypes.Continuous:
            return stats[attribute].avg
        if attribute.varType == orange.VarTypes.Discrete:
            return _modus(orange.Distribution(attribute, data))
        return None

    center = [central_value(attribute) for attribute in data.domain.attributes]
    if data.domain.classVar:
        center.append(0)
    return orange.Example(data.domain, center)
Exemplo n.º 8
0
    def test_equalWidth(self):
        """Check equal-width discretization of the four iris attributes.

        Verifies the number of values, first cut, step and interval count
        for the default setting (4 intervals) and for 5 intervals, that the
        cut points are evenly spaced, that converting data through the
        discretized domain yields the expected interval index, and that the
        same holds for a pickled and unpickled copy of the domain.
        """
        d = orange.ExampleTable("iris")
        ba = orange.DomainBasicAttrStat(d)
        ddisc = orange.DomainDiscretization(orange.EqualWidthDiscretization())
        dd = ddisc(d)
        for i in range(4):
            self.assertEqual(len(dd[i].values), 4)
            mi, ma = ba[i].min, ba[i].max
            di = ma - mi
            trans = dd[i].get_value_from.transformer
            self.assertAlmostEqual(trans.first_cut, mi + di / 4, 1)
            self.assertAlmostEqual(trans.step, di / 4, 1)
            self.assertEqual(trans.n_intervals, 4)

        ddisc.discretization.n_intervals = 5

        dd = ddisc(d)
        for i in range(4):
            self.assertEqual(len(dd[i].values), 5)
            mi, ma = ba[i].min, ba[i].max
            di = ma - mi
            trans = dd[i].get_value_from.transformer
            self.assertAlmostEqual(trans.first_cut, mi + di / 5, 1)
            self.assertAlmostEqual(trans.step, di / 5, 1)
            self.assertEqual(trans.n_intervals, 5)
            points = trans.points
            # Index with j (not the attribute index i) so that every cut
            # point of every attribute is actually checked.
            for j in range(4):
                self.assertAlmostEqual(points[j],
                                       trans.first_cut + j * di / 5)

        d2 = orange.ExampleTable(dd, d)
        for e, e2 in zip(d[:5], d2):
            for i in range(4):
                trans = dd[i].get_value_from.transformer
                self.assertEqual(
                    e2[i],
                    math.floor((e[i] - trans.first_cut) / trans.step) + 1)

        s = pickle.dumps(dd)
        dd2 = pickle.loads(s)
        d3 = orange.ExampleTable(dd2, d)
        # Values produced by the unpickled domain are compared against the
        # transformer of the original domain.
        for e, e2 in zip(d[:5], d3):
            for i in range(4):
                trans = dd[i].get_value_from.transformer
                self.assertEqual(
                    e2[i],
                    math.floor((e[i] - trans.first_cut) / trans.step) + 1)
0
    def MeasureAttribute_info(self, attr, data):
        """Return the normalized class-separation score of ``attr``.

        Scores are cached in ``self.attrInfo`` and recomputed only when no
        statistics are stored or ``data`` differs from the cached data set.
        For every pair of class values and every continuous attribute, the
        absolute difference of the class averages is multiplied by the
        combined example count and divided by the count-weighted sum of the
        deviations; the sums are then scaled so the largest equals 1.
        """
        # Cache hit: statistics exist and belong to this data set.
        if self.stats and self.dataset == data:
            return self.attrInfo[data.domain[attr].name]

        self.stats = {}
        self.dataset = data

        attributes = data.domain.attributes
        scores = [0] * len(attributes)

        # Basic statistics of every attribute, separately for each class.
        for classValue in data.domain.classVar.values:
            subset = data.select({data.domain.classVar: classValue})
            self.stats[classValue] = orange.DomainBasicAttrStat(subset)

        classKeys = self.stats.keys()
        for i in range(len(classKeys)):
            statI = self.stats[classKeys[i]]
            if len(statI) == 0:
                continue
            for j in range(i + 1, len(classKeys)):
                statJ = self.stats[classKeys[j]]
                if len(statJ) == 0:
                    continue
                for index in range(len(attributes)):
                    if attributes[index].varType != orange.VarTypes.Continuous:
                        continue
                    first, second = statI[index], statJ[index]
                    weightedDev = first.n * first.dev + second.n * second.dev
                    # Avoid dividing by zero for constant attributes.
                    if weightedDev == 0.0:
                        weightedDev = 0.001
                    scores[index] += (abs(first.avg - second.avg) *
                                      (first.n + second.n) / weightedDev)

        # Normalize so that the largest score is 1 and the others are
        # proportionally smaller.
        largest = max(scores)
        if largest != 0:
            scores = [score / largest for score in scores]

        for attribute, score in zip(attributes, scores):
            self.attrInfo[attribute.name] = score

        return self.attrInfo[data.domain[attr].name]
Exemplo n.º 10
0
def tubedRegression(cache, dimensions, progressCallback=None, **args):
    """Estimate, per example and per dimension, a local regression slope.

    For every dimension ``d`` in ``dimensions`` and every example in
    ``cache.data``, the nearest neighbours of the example are collected and a
    weighted univariate linear regression of the class value on the
    attribute ``cache.contIndices[d]`` is fitted.  The slope is stored in
    ``cache.deltas[exi][d]`` and the probability returned by
    ``statc.fprob`` for the fit in ``cache.errors[exi][d]``.  ``"?"`` is
    stored as the slope when the reference value is missing, no neighbour
    has a known value, or the regression is degenerate.

    ``cache.findNearest`` and ``cache.attrStat`` are created on first use.
    ``progressCallback``, if given, is called after every example with a
    value computed from the number of examples and dimensions processed.
    ``**args`` is accepted but not used.
    """
    if not cache.findNearest:
        cache.findNearest = orange.FindNearestConstructor_BruteForce(
            cache.data,
            distanceConstructor=orange.ExamplesDistanceConstructor_Euclidean(),
            includeSame=True)

    if not cache.attrStat:
        cache.attrStat = orange.DomainBasicAttrStat(cache.data)

    normalizers = cache.findNearest.distance.normalizers

    if progressCallback:
        nExamples = len(cache.data)
        nPoints = 100.0 / nExamples / len(dimensions)

    # cache.nNeighbours when there is more than one continuous attribute (and
    # it is non-zero), otherwise len(cache.deltas).
    effNeighbours = len(cache.contAttributes) > 1 and cache.nNeighbours or len(
        cache.deltas)

    for di, d in enumerate(dimensions):
        contIdx = cache.contIndices[d]

        # A constant attribute cannot be regressed on; skip the dimension.
        minV, maxV = cache.attrStat[contIdx].min, cache.attrStat[contIdx].max
        if minV == maxV:
            continue

        # Temporarily zero this attribute's normalizer; it is restored at the
        # end of the loop body.
        # NOTE(review): presumably this removes the attribute from the
        # distance used for the neighbour search - confirm.
        oldNormalizer = normalizers[cache.contIndices[d]]
        normalizers[cache.contIndices[d]] = 0

        for exi, ref_example in enumerate(cache.data):
            if ref_example[contIdx].isSpecial():
                cache.deltas[exi][d] = "?"
                continue

            ref_x = float(ref_example[contIdx])

            # Weighted sums for the least-squares fit; n is the sum of weights.
            Sx = Sy = Sxx = Syy = Sxy = n = 0.0

            nn = cache.findNearest(ref_example, 0, True)
            # Keep only neighbours with a known value, at most effNeighbours.
            nn = [ex for ex in nn
                  if not ex[contIdx].isSpecial()][:effNeighbours]
            mx = [abs(ex[contIdx] - ref_x) for ex in nn]
            if not mx:
                cache.deltas[exi][d] = "?"
                continue
            # Kernel width chosen so that the farthest neighbour gets weight
            # 0.001: w = exp(kw * dx**2).
            if max(mx) < 1e-10:
                kw = math.log(.001)
            else:
                kw = math.log(.001) / max(mx)**2
            for ex in nn[:effNeighbours]:
                ex_x = float(ex[contIdx])
                ex_y = float(ex.getclass())
                w = math.exp(kw * (ex_x - ref_x)**2)
                Sx += w * ex_x
                Sy += w * ex_y
                Sxx += w * ex_x**2
                Syy += w * ex_y**2
                Sxy += w * ex_x * ex_y
                n += w

            div = n * Sxx - Sx**2
            if div:  # and i<40:
                # Slope of the weighted least-squares line.
                b = (Sxy * n - Sx * Sy) / div

                #                div = Sx*Sy/n - Sxy
                #                if abs(div) < 1e-10:
                #                    cache.errors[exi][d] = 1
                #                else:
                #                    B = ((Syy - Sy**2/n) - (Sxx - Sx**2/n)) / 2 / div
                #
                #                    b_p = -B + math.sqrt(B**2+1)
                #                    a = Sy/n - b_p * Sx/n
                #                    error1 = 1/(1+b_p**2) * (Syy + a**2 + b_p**2*Sxx - 2*a*Sy + 2*a*b_p*Sx - 2*b_p*Sxy)
                #
                #                    b_2 = -B - math.sqrt(B**2+1)
                #                    a = Sy/n - b_p * Sx/n
                #                    error2 = 1/(1+b_p**2) * (Syy + a**2 + b_p**2*Sxx - 2*a*Sy + 2*a*b_p*Sx - 2*b_p*Sxy)
                #
                #                    if error1 < error2 and error1 >= 0:
                #                        cache.errors[exi][d] = error1
                #                    elif error2 >= 0:
                #                        cache.errors[exi][d] = error2
                #                    else:
                #                        cache.errors[exi][d] = 42
                #                        print error1, error2

                # Intercept, residual sum of squares (err), total sum of
                # squares (tot) and the part explained by the model (mod).
                a = (Sy - b * Sx) / n
                err = (n * a**2 + b**2 * Sxx + Syy + 2 * a * b * Sx -
                       2 * a * Sy - 2 * b * Sxy)
                tot = Syy - Sy**2 / n
                mod = tot - err
                merr = err / (n - 2)
                # A (near) zero mean residual is reported as F = 0 with
                # probability 1 instead of dividing by it.
                if merr < 1e-10:
                    F = 0
                    Fprob = 1
                else:
                    F = mod / merr
                    Fprob = statc.fprob(F, 1, int(n - 2))
                cache.errors[exi][d] = Fprob
                #                        print "%.4f" % Fprob,
                #print ("%.3f\t" + "%.0f\t"*6 + "%f\t%f") % (w, ref_x, ex_x, n, a, b, merr, F, Fprob)
                cache.deltas[exi][d] = b
            else:
                cache.deltas[exi][d] = "?"

            if progressCallback:
                progressCallback((nExamples * di + exi) * nPoints)

        normalizers[cache.contIndices[d]] = oldNormalizer
Exemplo n.º 11
0
    def commit_data(self):
        """Send the selected and unselected items and the cluster centroids.

        Does nothing when the matrix has no items.  When nothing is
        selected, ``None`` is sent on "Selected Data", "Other Data" and
        "Centroids".  For ``ExampleTable`` items the selected and remaining
        examples are sent (optionally labelled with a cluster variable placed
        according to ``self.addIdAs``), followed by one centroid example per
        selected cluster.  For other items, when ``self.matrixSource`` is
        "Data Distance", the selection grouped by ``strain`` is sent on
        "Structured Data Files".
        """
        items = getattr(self.matrix, "items", None)
        if not items:
            return  # nothing to commit

        self.selectionChanged = False
        self.selectedExamples = None
        selection = sorted(self.selected_clusters, key=lambda c: c.first)
        # For every selected cluster, the item indices it covers.
        maps = [
            list(self.root_cluster.mapping[c.first:c.last]) for c in selection
        ]

        selected_indices = [k for cluster_map in maps for k in cluster_map]
        unselected_indices = sorted(
            set(self.root_cluster.mapping) - set(selected_indices))

        self.selection = selected = [items[k] for k in selected_indices]
        unselected = [items[k] for k in unselected_indices]

        if not selected:
            self.send("Selected Data", None)
            self.send("Other Data", None)
            self.send("Centroids", None)
            return

        if isinstance(items, ExampleTable):
            # c[n] is the number of the cluster the n-th selected item is in.
            c = [i for i, cluster_map in enumerate(maps) for _ in cluster_map]
            clustVar = None
            if self.AppendClusters:
                clustVar = orange.EnumVariable(
                    str(self.ClassifyName),
                    values=["Cluster " + str(i)
                            for i in range(len(maps))] + ["Other"])
                origDomain = items.domain
                if self.addIdAs == 0:
                    # Cluster becomes the class; the old class becomes a meta.
                    domain = orange.Domain(origDomain.attributes, clustVar)
                    if origDomain.classVar:
                        domain.addmeta(orange.newmetaid(), origDomain.classVar)
                elif self.addIdAs == 1:
                    # Cluster is appended as an ordinary attribute.
                    domain = orange.Domain(origDomain.attributes + [clustVar],
                                           origDomain.classVar)
                else:
                    # Cluster is added as a meta attribute.
                    domain = orange.Domain(origDomain.attributes,
                                           origDomain.classVar)
                    domain.addmeta(orange.newmetaid(), clustVar)

                domain.addmetas(origDomain.getmetas())

                # ``selected`` is known to be non-empty at this point.
                table1 = orange.ExampleTable(domain, selected)
                for i in range(len(selected)):
                    table1[i][clustVar] = clustVar("Cluster " + str(c[i]))

                table2 = None
                if unselected:
                    table2 = orange.ExampleTable(domain, unselected)
                    for ex in table2:
                        ex[clustVar] = clustVar("Other")

                self.selectedExamples = table1
                self.unselectedExamples = table2
            else:
                self.selectedExamples = orange.ExampleTable(selected)
                self.unselectedExamples = orange.ExampleTable(
                    unselected) if unselected else None

            self.send("Selected Data", self.selectedExamples)
            self.send("Other Data", self.unselectedExamples)

            self.centroids = None
            if self.selectedExamples:
                self.centroids = orange.ExampleTable(
                    self.selectedExamples.domain)
                for i in range(len(maps)):
                    clusterEx = [
                        ex for cluster, ex in zip(c, self.selectedExamples)
                        if cluster == i
                    ]
                    clusterEx = orange.ExampleTable(clusterEx)
                    # Average where basic statistics exist, otherwise the
                    # modus of the distribution, otherwise unknown.
                    contstat = orange.DomainBasicAttrStat(clusterEx)
                    discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                    ex = [
                        cs.avg if cs else (ds.modus() if ds else "?")
                        for cs, ds in zip(contstat, discstat)
                    ]
                    example = orange.Example(self.centroids.domain, ex)
                    if clustVar is not None:
                        example[clustVar] = clustVar(i)
                    # Append the labelled example, not the raw value list,
                    # so the cluster label assigned above is kept.
                    self.centroids.append(example)
            self.send("Centroids", self.centroids)

        elif self.matrixSource == "Data Distance":
            names = list(set([d.strain for d in self.selection]))
            data = [(name, [
                d for d in filter(lambda a: a.strain == name, self.selection)
            ]) for name in names]
            self.send("Structured Data Files", data)
Exemplo n.º 12
0
# Description: Shows how to compute and print out the basic attribute statistics
# Category:    statistics
# Classes:     DomainBasicAttrStat, BasicAttrStat
# Uses:        iris
# Referenced:  basicstat.htm

import orange
# Compute the basic statistics of every attribute in the iris data set.
data = orange.ExampleTable("iris")
bas = orange.DomainBasicAttrStat(data)

# One row per attribute that has statistics; empty entries are skipped.
print("%20s  %5s  %5s  %5s" % ("attribute", "min", "max", "avg"))
for a in bas:
    if not a:
        continue
    print("%20s  %5.3f  %5.3f  %5.3f" % (a.variable.name, a.min, a.max,
                                         a.avg))

# Statistics can also be looked up by attribute name.
print(bas["sepal length"].avg)
Exemplo n.º 13
0
    def __call__(self, attribute, data):
        """Return the rank-based score of ``attribute`` for ``data``.

        On the first call for a data set, every class value gets its own
        ordering of the non-discrete attributes (best first), based on
        ``S2NMeasure.__call__`` evaluated on ``mergeClassValues(data, c)``.
        The per-class orderings are then interleaved round-robin into
        ``self.sortedAttrList`` without repeating attributes, and each
        attribute's score is the number of listed attributes minus its
        position.  Scores are cached in ``self.attrInfoMix``; -1 is returned
        for an attribute that has no score.
        """

        # if the data changed clear the attribute values
        if data != self.dataMix:
            self.attrInfoMix = {}
            self.attrInfo = {}
            self.dataMix = data

        if self.attrInfoMix == {}:
            attrs = range(len(data.domain.attributes))
            classVar = data.domain.classVar
            #shortData = data.select(attrs + [classVar])
            # Basic statistics of all attributes, separately for each class.
            datas = [
                data.select({classVar.name: [val]}) for val in classVar.values
            ]
            statistics = [orange.DomainBasicAttrStat(d) for d in datas]

            # cls[k] will hold the attribute indices ordered for class k.
            cls = []
            for classVarIndex, c in enumerate(
                    classVar.values
            ):  # for each class value compute how good is each attribute for discriminating this class value against all other
                attrValsList = []
                # NOTE(review): presumably a copy of the data with every
                # class other than c merged into one - confirm against
                # mergeClassValues.
                newData = mergeClassValues(data, c)
                for attrIndex in range(len(attrs)):
                    if data.domain[
                            attrIndex].varType == orange.VarTypes.Discrete:  # ignore discrete attributes
                        continue
                    val = S2NMeasure.__call__(self, attrs[attrIndex], newData)
                    if statistics[0][attrIndex] == None:
                        attrValsList.append((0, attrs[attrIndex]))
                    else:
                        # The score is negated unless this class has the
                        # largest average of the attribute among all classes.
                        aves = [stat[attrIndex].avg for stat in statistics]
                        if max(aves) != aves[classVarIndex]:
                            val = -val
                        attrValsList.append((val, attrs[attrIndex]))
                # Sort ascending by value, drop the values, then reverse so
                # the highest-valued attribute comes first.
                attrValsList.sort()
                attrValsList = [element[1] for element in attrValsList
                                ]  # remove the value
                attrValsList.reverse()
                cls.append(attrValsList)

            # For every attribute, its position in each class's ordering.
            attrPositionsDict = dict([(attr, []) for attr in cls[0]])
            for arr in cls:
                for i in range(len(arr)):
                    attrPositionsDict[arr[i]].append(i)

            numClasses = len(classVar.values)
            currPos = [0 for i in range(numClasses)]
            self.sortedAttrList = []
            ableToAdd = 1
            # Round-robin over the classes: take each class's best attribute
            # that has not been taken yet, until every list is exhausted.
            while ableToAdd:  # sometimes some attributes are duplicated. in such cases we will add only one instance of such attribute to the list
                ableToAdd = 0
                for i in range(numClasses):
                    pos = currPos[i]
                    # Skip entries already taken (blanked out with None).
                    while pos < len(cls[i]) and cls[i][pos] == None:
                        pos += 1
                    currPos[i] = pos + 1
                    if pos >= len(cls[i]):
                        continue
                    ableToAdd = 1

                    attr = cls[i][pos]
                    self.sortedAttrList.append(attr)
                    attrPositions = attrPositionsDict[
                        attr]  # get indices in cls where attribute attr is placed
                    # Blank the attribute out in every class's ordering so it
                    # is not added a second time.
                    for j in range(numClasses):
                        cls[j][attrPositions[j]] = None

            # Earlier position in the merged list means a higher score.
            count = len(self.sortedAttrList)
            for (i, attr) in enumerate(self.sortedAttrList):
                self.attrInfoMix[data.domain[attr].name] = count - i

        if self.attrInfoMix.has_key(data.domain[attribute].name):
            return self.attrInfoMix[data.domain[attribute].name]
        else:
            return -1