예제 #1
0
    def test_construction(self):
        """Exercise indexing, iteration and the skip flags of DomainDistributions.

        Uses the "iris" table: positions 0-3 are expected to hold continuous
        distributions and position 4 (the class) a discrete one.
        """
        table = orange.ExampleTable("iris")
        dists = orange.DomainDistributions(table)

        # Each attribute must resolve to the same object whether it is looked
        # up by position, by variable descriptor or by variable name.
        for pos in range(4):
            self.assertTrue(isinstance(dists[pos], orange.ContDistribution))
            for key in (table.domain[pos], table.domain[pos].name):
                self.assertEqual(id(dists[pos]), id(dists[key]))

        # The class distribution: position 4, class descriptor, name, and -1.
        self.assertTrue(isinstance(dists[4], orange.DiscDistribution))
        for key in (table.domain.classVar, "iris", -1):
            self.assertEqual(id(dists[4]), id(dists[key]))

        # Iterating yields the very same objects as positional indexing.
        snapshot = list(dists)
        for pos in range(len(snapshot)):
            self.assertEqual(id(snapshot[pos]), id(dists[pos]))

        # skip_discrete leaves the continuous slots filled and the class empty.
        dists = orange.DomainDistributions(table, skip_discrete=True)
        for pos in range(4):
            self.assertTrue(isinstance(dists[pos], orange.ContDistribution))
        self.assertEqual(dists[-1], None)

        # skip_continuous does the opposite.
        dists = orange.DomainDistributions(table, skip_continuous=True)
        for pos in range(4):
            self.assertEqual(dists[pos], None)
        self.assertTrue(isinstance(dists[-1], orange.DiscDistribution))
        self.assertEqual(list(dists[-1]), [50, 50, 50])

        # With both flags set every slot is empty.
        dists = orange.DomainDistributions(
            table, skip_continuous=True, skip_discrete=True)
        for pos in range(5):
            self.assertEqual(dists[pos], None)
예제 #2
0
    def test_pickle(self):
        """Check that DomainDistributions survives a pickle round trip.

        The unpickled object is checked for the same indexing behaviour as a
        freshly built one, with and without the skip flags.
        """
        import pickle

        table = orange.ExampleTable("iris")
        dd = orange.DomainDistributions(table)
        dd2 = pickle.loads(pickle.dumps(dd))

        for pos in range(4):
            self.assertTrue(isinstance(dd2[pos], orange.ContDistribution))
            # Position, descriptor and name must address the same object.
            for key in (table.domain[pos], table.domain[pos].name):
                self.assertEqual(id(dd2[pos]), id(dd2[key]))
            # The restored distribution equals the one that was pickled.
            self.assertEqual(dd[pos], dd2[pos])
        self.assertTrue(isinstance(dd2[4], orange.DiscDistribution))
        for key in (table.domain.classVar, "iris", -1):
            self.assertEqual(id(dd2[4]), id(dd2[key]))

        # Discrete variables skipped: the class slot stays empty after loading.
        dd = orange.DomainDistributions(table, skip_discrete=True)
        dd2 = pickle.loads(pickle.dumps(dd))
        for pos in range(4):
            self.assertTrue(isinstance(dd2[pos], orange.ContDistribution))
        self.assertEqual(dd2[-1], None)

        # Continuous variables skipped: only the class distribution is restored.
        dd = orange.DomainDistributions(table, skip_continuous=True)
        dd2 = pickle.loads(pickle.dumps(dd))
        for pos in range(4):
            self.assertEqual(dd2[pos], None)
        self.assertTrue(isinstance(dd2[-1], orange.DiscDistribution))
        self.assertEqual(list(dd2[-1]), [50, 50, 50])
예제 #3
0
def cforange_hierarchical_clustering_finished(postdata, input_dict,
                                              output_dict):
    """Build selected/unselected example tables and cluster centroids.

    Parameters:
        postdata: mapping of posted values. ``postdata['widget_id'][0]`` is
            read, and ``postdata.get('selected_nodes')[0]`` must be a JSON
            list of ``(example index, selected flag, cluster number)``
            triples.
        input_dict: mapping with ``'dm'`` (the distance matrix; its ``items``
            attribute is inspected) and ``'linkage'`` (converted with
            ``int``).
        output_dict: not used by this function.

    Returns:
        dict with the keys ``'centroids'``, ``'selected_examples'`` and
        ``'unselected_examples'``. All three values are ``None`` when
        ``matrix.items`` is not an ``orange.ExampleTable``.

    Raises:
        Exception: with a fixed user-facing message when the posted
            ``selected_nodes`` value is missing or cannot be parsed.
    """
    import json

    matrix = input_dict['dm']
    linkage = int(input_dict['linkage'])
    # Not used below; the lookup is kept so that a request without
    # 'widget_id' keeps failing with KeyError exactly as before.
    widget_pk = postdata['widget_id'][0]
    try:
        selected_nodes = json.loads(postdata.get('selected_nodes')[0])
    except Exception:
        # `except Exception` instead of a bare `except` so KeyboardInterrupt
        # and SystemExit are no longer turned into a user message.
        raise Exception('Please select a threshold for determining clusters.')

    # Imported only after the request has been validated, so an invalid
    # request is rejected without loading the library.
    import orange

    if isinstance(matrix.items, orange.ExampleTable):
        # NOTE(review): the returned tree is not used here; the call is kept
        # in case it validates its input or has side effects - confirm.
        Clustering.hierarchical_clustering(linkage, matrix)
        # Sorted so the order of the variable's values is deterministic.
        cluster_ids = sorted({cluster for _, _, cluster in selected_nodes})
        selected_clusters = {
            cluster for _, selected, cluster in selected_nodes if selected
        }
        clustVar = orange.EnumVariable(
            str('Cluster'),
            values=["Cluster %d" % i for i in cluster_ids] + ["Other"])
        origDomain = matrix.items.domain
        domain = orange.Domain(origDomain.attributes, origDomain.classVar)
        domain.addmeta(orange.newmetaid(), clustVar)
        domain.addmetas(origDomain.getmetas())
        # Build table with selected clusters
        selected_table = orange.ExampleTable(domain)
        unselected_table = orange.ExampleTable(domain)
        for index, selected, cluster in selected_nodes:
            new_ex = orange.Example(domain, matrix.items[index])
            if selected:
                new_ex[clustVar] = clustVar("Cluster %d" % cluster)
                selected_table.append(new_ex)
            else:
                new_ex[clustVar] = clustVar("Other")
                unselected_table.append(new_ex)
        # Build table of centroids
        centroids = orange.ExampleTable(selected_table.domain)
        if len(selected_table) > 0:
            for cluster in sorted(selected_clusters):
                clusterEx = orange.ExampleTable([
                    ex for ex in selected_table
                    if ex[clustVar] == "Cluster %d" % cluster
                ])
                # Attribute statistics
                contstat = orange.DomainBasicAttrStat(clusterEx)
                discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                # Per variable: the average when a basic statistic exists,
                # otherwise the distribution's modus, otherwise unknown "?".
                ex = [
                    cs.avg if cs else (ds.modus() if ds else "?")
                    for cs, ds in zip(contstat, discstat)
                ]
                example = orange.Example(centroids.domain, ex)
                example[clustVar] = clustVar("Cluster %d" % cluster)
                centroids.append(example)
    else:  # Attribute distance
        centroids, selected_table, unselected_table = None, None, None
    return {
        'centroids': centroids,
        'selected_examples': selected_table,
        'unselected_examples': unselected_table
    }
예제 #4
0
    def test_equalFreq(self):
        """Check equal-frequency discretization into five intervals.

        The first attribute of "iris" is overwritten with 0..149, so five
        equal-frequency bins are expected to hold 30 examples each.
        """
        expected_cuts = [29.5, 59.5, 89.5, 119.5]

        table = orange.ExampleTable("iris")
        for row in range(150):
            table[row, 0] = row

        # Discretize the whole domain and convert the data.
        discretization = orange.DomainDiscretization(
            orange.EqualFreqDiscretization(n_intervals=5), table)
        binned = orange.ExampleTable(discretization, table)
        dist = orange.DomainDistributions(binned)
        self.assertEqual(dist[0], [30] * 5)
        self.assertEqual(
            discretization[0].get_value_from.transformer.points,
            expected_cuts)

        # Discretizing the single variable directly gives the same cut points.
        single = orange.EqualFreqDiscretization(n_intervals=5)(
            table.domain[0], table)
        self.assertEqual(single.get_value_from.transformer.points,
                         expected_cuts)

        # The cut points survive a pickle round trip.
        restored = pickle.loads(pickle.dumps(discretization))
        self.assertEqual(restored[0].get_value_from.transformer.points,
                         expected_cuts)
예제 #5
0
    def commit_data(self):
        """Send the selected and remaining examples plus cluster centroids.

        Reads ``self.matrix.items``, ``self.selected_clusters`` and
        ``self.root_cluster.mapping``. It stores the results in
        ``self.selection``, ``self.selectedExamples``,
        ``self.unselectedExamples`` and ``self.centroids``.

        It sends "Selected Data", "Other Data" and "Centroids" when the items
        are an ``ExampleTable``. It sends "Structured Data Files" when
        ``self.matrixSource`` is "Data Distance". It returns without sending
        anything when the matrix has no items.
        """
        items = getattr(self.matrix, "items", None)
        if not items:
            return  # nothing to commit

        self.selectionChanged = False
        # Reset every stored result so no stale table from an earlier commit
        # survives an early exit below.
        self.selectedExamples = None
        self.unselectedExamples = None
        self.centroids = None
        selection = sorted(self.selected_clusters, key=lambda c: c.first)
        # One list of item indices per selected cluster, in cluster order.
        maps = [
            list(self.root_cluster.mapping[c.first:c.last]) for c in selection
        ]

        # Flatten with a comprehension: reduce() is not a builtin on Python 3
        # and repeated list concatenation is quadratic.
        selected_indices = [index for mapping in maps for index in mapping]
        unselected_indices = sorted(
            set(self.root_cluster.mapping) - set(selected_indices))

        self.selection = selected = [items[k] for k in selected_indices]
        unselected = [items[k] for k in unselected_indices]

        if not selected:
            self.send("Selected Data", None)
            self.send("Other Data", None)
            self.send("Centroids", None)
            return

        if isinstance(items, ExampleTable):
            # c[k] is the number of the cluster that selected[k] belongs to.
            c = [i for i, mapping in enumerate(maps) for _ in mapping]
            aid = clustVar = None
            if self.AppendClusters:
                clustVar = orange.EnumVariable(
                    str(self.ClassifyName),
                    values=["Cluster " + str(i)
                            for i in range(len(maps))] + ["Other"])
                origDomain = items.domain
                if self.addIdAs == 0:
                    # Cluster becomes the class; the old class moves to metas.
                    domain = orange.Domain(origDomain.attributes, clustVar)
                    if origDomain.classVar:
                        domain.addmeta(orange.newmetaid(), origDomain.classVar)
                    aid = -1
                elif self.addIdAs == 1:
                    # Cluster is appended as an ordinary attribute.
                    domain = orange.Domain(origDomain.attributes + [clustVar],
                                           origDomain.classVar)
                    aid = len(origDomain.attributes)
                else:
                    # Cluster is stored as a meta attribute.
                    domain = orange.Domain(origDomain.attributes,
                                           origDomain.classVar)
                    aid = orange.newmetaid()
                    domain.addmeta(aid, clustVar)

                domain.addmetas(origDomain.getmetas())
                table1 = table2 = None
                if selected:
                    table1 = orange.ExampleTable(domain, selected)
                    for i in range(len(selected)):
                        table1[i][clustVar] = clustVar("Cluster " + str(c[i]))

                if unselected:
                    table2 = orange.ExampleTable(domain, unselected)
                    for ex in table2:
                        ex[clustVar] = clustVar("Other")

                self.selectedExamples = table1
                self.unselectedExamples = table2
            else:
                self.selectedExamples = orange.ExampleTable(
                    selected) if selected else None
                self.unselectedExamples = orange.ExampleTable(
                    unselected) if unselected else None

            self.send("Selected Data", self.selectedExamples)
            self.send("Other Data", self.unselectedExamples)

            self.centroids = None
            if self.selectedExamples:
                self.centroids = orange.ExampleTable(
                    self.selectedExamples.domain)
                for i in range(len(maps)):
                    clusterEx = [
                        ex for cluster, ex in zip(c, self.selectedExamples)
                        if cluster == i
                    ]
                    clusterEx = orange.ExampleTable(clusterEx)
                    contstat = orange.DomainBasicAttrStat(clusterEx)
                    discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                    # Per variable: the average when a basic statistic
                    # exists, otherwise the modus, otherwise unknown "?".
                    ex = [
                        cs.avg if cs else (ds.modus() if ds else "?")
                        for cs, ds in zip(contstat, discstat)
                    ]
                    example = orange.Example(self.centroids.domain, ex)
                    if clustVar is not None:
                        example[clustVar] = clustVar(i)
                    # Append the labelled example, not the raw value list,
                    # so the centroid keeps its cluster value.
                    self.centroids.append(example)
            self.send("Centroids", self.centroids)

        elif self.matrixSource == "Data Distance":
            # Group the selected items by their `strain` attribute.
            names = list(set([d.strain for d in self.selection]))
            data = [(name, [
                d for d in filter(lambda a: a.strain == name, self.selection)
            ]) for name in names]
            self.send("Structured Data Files", data)
예제 #6
0
# Description: Show frequencies for values of discrete attributes, count number of instances where attribute is not defined
# Category:    description
# Uses:        adult_sample.tab
# Referenced:  basic_exploration.htm

import orange

data = orange.ExampleTable("../datasets/adult_sample")
dist = orange.DomainDistributions(data)
attributes = data.domain.attributes

# Single-argument parenthesised print gives the same output whether print is
# a statement or a function.
print("Average values and mean square errors:")
for index, attr in enumerate(attributes):
    if attr.varType != orange.VarTypes.Continuous:
        continue
    print("%s, mean=%5.2f +- %5.2f" %
          (attr.name, dist[index].average(), dist[index].error()))

print("\nFrequencies for values of discrete attributes:")
for index, attr in enumerate(attributes):
    if attr.varType != orange.VarTypes.Discrete:
        continue
    print("%s:" % attr.name)
    # One line per declared value with its count from the distribution.
    for value_index, value in enumerate(attr.values):
        print("  %s: %d" % (value, int(dist[index][value_index])))

print("\nNumber of instances where attribute is not defined:")
for index, attr in enumerate(attributes):
    print("  %2d %s" % (dist[index].unknowns, attr.name))
예제 #7
0
    def test_continuizer_zoo(self):
        """Exercise DomainContinuizer on the "zoo" table.

        Goes through the class treatments and every multinomial treatment,
        checking the indicator columns generated for the multi-valued "legs"
        attribute and the handling of the class variable.
        """
        d = orange.ExampleTable("zoo")
        dd = orange.DomainDistributions(d)
        # Start iterating over the distributions and stop early; nothing is
        # asserted here, the loop only has to run without raising.
        for i, e in enumerate(dd):
            if i == 2:
                break

        dc = orange.DomainContinuizer()

        dc.multinomial_treatment = dc.MultinomialTreatment.LowestIsBase

        dc.class_treatment = dc.ClassTreatment.ErrorIfCannotHandle
        self.assertRaises(ValueError, dc, d.domain)

        dc.class_treatment = dc.ClassTreatment.LeaveUnlessTarget
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
        for l in [2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
        # With LowestIsBase the lowest value gets no indicator column.
        self.assertFalse("legs=0" in cdomain)
        self.assertEqual(cdomain.classVar.name, "type")
        self.assertFalse(cdomain.has_discrete_attributes())
        self.assertFalse(cdomain.has_discrete_attributes(False))
        self.assertTrue(cdomain.has_discrete_attributes(True))

        dc.class_treatment = dc.ClassTreatment.AsOrdinal
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
        for l in [2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
        self.assertFalse("legs=0" in cdomain)
        self.assertEqual(cdomain.classVar.name, "C_type")
        self.assertEqual(dd[0, -1], d.domain.class_var.values.index("mammal"))
        self.assertFalse(cdomain.has_discrete_attributes())

        # Same checks, but the continuizer is given the data, not the domain.
        dc.class_treatment = dc.ClassTreatment.AsOrdinal
        cdomain = dc(d)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
        for l in [2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
        self.assertFalse("legs=0" in cdomain)
        self.assertEqual(cdomain.classVar.name, "C_type")
        self.assertEqual(dd[0, -1], d.domain.class_var.values.index("mammal"))
        self.assertFalse(cdomain.has_discrete_attributes())

        # FrequentIsBase is expected to fail on a bare domain and to work
        # when the data table is passed instead.
        dc.multinomial_treatment = dc.MultinomialTreatment.FrequentIsBase
        self.assertRaises(ValueError, dc, d.domain)
        cdomain = dc(d)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(dd[0, 0], 1)
        self.assertEqual(dd[0, 1], 0)
        self.assertEqual(dd[0, 2], 1)

        dc.multinomial_treatment = dc.MultinomialTreatment.FrequentIsBase
        dc.zero_based = False
        self.assertRaises(ValueError, dc, d.domain)
        cdomain = dc(d)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(dd[0, 0], 1)
        self.assertEqual(dd[0, 1], -1)
        self.assertEqual(dd[0, 2], 1)
        dc.zero_based = True

        dc.multinomial_treatment = dc.MultinomialTreatment.NValues
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        for l in [0, 2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)

        dc.multinomial_treatment = dc.MultinomialTreatment.Ignore
        cdomain = dc(d.domain)
        for l in [0, 2, 4, 5, 6, 8]:
            # Interpolate the value: the bare "legs=%i" template is never a
            # variable name, which made this assertion vacuous.
            self.assertFalse("legs=%i" % l in cdomain)

        dc.multinomial_treatment = dc.MultinomialTreatment.IgnoreAllDiscrete
        cdomain = dc(d.domain)
        self.assertEqual(cdomain.variables, [cdomain.class_var])

        dc.multinomial_treatment = dc.MultinomialTreatment.ReportError
        self.assertRaises(ValueError, dc, d.domain)

        dc.multinomial_treatment = dc.MultinomialTreatment.AsOrdinal
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        for e, ec in zip(d[:10], dd):
            self.assertEqual(int(e["legs"]), ec["C_legs"])

        dc.multinomial_treatment = dc.MultinomialTreatment.AsNormalizedOrdinal
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        for e, ec in zip(d[:10], dd):
            self.assertEqual(int(e["legs"]) / 5, ec["C_legs"])