def test_rnd_sampling(data, learners, p=0.9, n=30, x=1):
    """Estimate learner performance by repeated random sampling.

    Repeats `n` times: draws a training split (proportion `p` of `data`) and
    an independently-seeded 10% test split, trains every learner on the
    training split and accumulates the statistics returned by `accuracy`.

    Parameters:
        data: an orange example table to sample from.
        learners: sequence of learner factories, each callable on a table.
        p: proportion of examples assigned to the training split.
        n: number of sampling repetitions.
        x: unused; kept only for backward compatibility with callers.

    Returns:
        Tuple of five lists (accuracy, true-pos, true-neg, false-pos,
        false-neg), each averaged over the n repetitions, one entry per
        learner.
    """
    nlearn = len(learners)
    acc = [0.0] * nlearn
    tpos = [0.0] * nlearn
    tneg = [0.0] * nlearn
    fpos = [0.0] * nlearn
    fneg = [0.0] * nlearn
    for i in range(n):
        # Clock-derived seeds so each repetition uses a different split.
        newselection = orange.MakeRandomIndices2(data, 0.1, randseed=(datetime.datetime.now().time().microsecond + i))
        selection = orange.MakeRandomIndices2(data, p, randseed=(datetime.datetime.now().time().microsecond - i))
        train_data = data.select(selection, 0)  # examples flagged 0 form the training set
        test_data = data.select(newselection, 0)
        classifiers = [l(train_data) for l in learners]
        # Evaluate once and unpack: the original called accuracy() five
        # times per repetition, recomputing every statistic each time.
        stats = accuracy(test_data, classifiers)
        acc1, tpos1, tneg1, fpos1, fneg1 = stats[0], stats[1], stats[2], stats[3], stats[4]
        for j in range(nlearn):
            acc[j] += acc1[j]
            tpos[j] += tpos1[j]
            tneg[j] += tneg1[j]
            fpos[j] += fpos1[j]
            fneg[j] += fneg1[j]
    # Average the accumulated sums over the number of repetitions.
    for j in range(nlearn):
        acc[j] /= n
        tpos[j] /= n
        tneg[j] /= n
        fpos[j] /= n
        fneg[j] /= n
    return (acc, tpos, tneg, fpos, fneg)
Пример #2
0
    def test_distance_on(self, dataset):
        """Check the distance constructor on a small sample of `dataset`.

        Builds a distance matrix with ignore_class=True, verifies its type,
        dimension and non-negativity, then (if the data has a class variable)
        rebuilds it with ignore_class=False and checks the result differs.
        """
        import numpy
        # Work on at most 20 randomly chosen examples to keep the test fast.
        indices = orange.MakeRandomIndices2(dataset, min(20, len(dataset)))
        dataset = dataset.select(indices, 0)
        with member_set(self.distance_constructor, "ignore_class", True):
            mat = distance_matrix(dataset, self.distance_constructor)

        self.assertIsInstance(mat, Orange.misc.SymMatrix)
        self.assertEqual(mat.dim, len(dataset))

        # Distances must all be non-negative.
        m = numpy.array(list(mat))
        self.assertTrue((m >= 0.0).all())

        if dataset.domain.class_var:
            with member_set(self.distance_constructor, "ignore_class", False):
                try:
                    mat = distance_matrix(dataset, self.distance_constructor)
                except orange.KernelException, ex:
                    # Some constructors do not support class-aware distances;
                    # treat that as a pass rather than a failure.
                    if "not supported" in str(ex):
                        return
                    else:
                        raise
            m1 = numpy.array(list(mat))
            self.assertTrue(
                (m1 != m).all() or dataset,
                "%r does not seem to respect the 'ignore_class' flag")
Пример #3
0
def learningCurveWithTestData(
        learners,
        learnset,
        testset,
        times=10,
        proportions=orange.frange(0.1),
        strat=orange.MakeRandomIndices.StratifiedIfPossible,
        pps=[],
        **argkw):
    """Compute a learning curve on a fixed, pre-separated test set.

    For each proportion, `times` random subsets of `learnset` are drawn,
    each learner is trained on the subset and evaluated on `testset`.

    NOTE(review): the `pps` (preprocessors) parameter is accepted but never
    used in this body — confirm whether it should be forwarded to
    learnAndTestOnTestData.

    Returns a list of ExperimentResults, one per proportion.
    """
    verb = argkw.get("verbose", 0)

    learnset, learnweight = demangleExamples(learnset)
    testweight = demangleExamples(testset)[1]

    # First non-zero of: indicesrandseed, randseed, randomGenerator kwargs.
    randomGenerator = argkw.get("indicesrandseed", 0) or argkw.get(
        "randseed", 0) or argkw.get("randomGenerator", 0)
    pick = orange.MakeRandomIndices2(stratified=strat,
                                     randomGenerator=randomGenerator)
    allResults = []
    for p in proportions:
        printVerbose("Proportion: %5.3f" % p, verb)
        testResults = ExperimentResults(
            times, [l.name for l in learners],
            testset.domain.classVar.values.native(), testweight != 0,
            testset.domain.classVar.baseValue)
        testResults.results = []

        for t in range(times):
            printVerbose("  repetition %d" % t, verb)
            # Train on a random p-proportion of learnset, test on testset.
            learnAndTestOnTestData(learners, (learnset.selectref(
                pick(learnset, p), 0), learnweight), testset, testResults, t)

        allResults.append(testResults)

    return allResults
Пример #4
0
    def test_pickling_on(self, dataset):
        """ Test learner and classifier pickling.
        """
        classifier = self.learner(dataset)

        # Round-trip the trained classifier through pickle.
        classifier_clone = pickle.loads(pickle.dumps(classifier))

        # Compare predictions on a random sample of up to 20 examples.
        subset = dataset.select(orange.MakeRandomIndices2(p0=20)(dataset), 0)

        continuous = isinstance(dataset.domain.class_var,
                                Orange.feature.Continuous)
        for example in subset:
            original = classifier(example, orange.GetValue)
            cloned = classifier_clone(example, orange.GetValue)
            if continuous:
                # Test to third digit after the decimal point
                places = min(3, dataset.domain.class_var.number_of_decimals)
                self.assertAlmostEqual(
                    original.native(),
                    cloned.native(),
                    places,
                    "Pickled and original classifier return a different value!"
                )
            else:
                self.assertEqual(
                    original,
                    cloned,
                    "Pickled and original classifier return a different value!"
                )
Пример #5
0
def main():
    """Train and evaluate pairwise classifiers on the breadbox noun corpus.

    Loads annotations, splits them 50/50 into train/test, builds feature
    tables with PairwiseEngine, then runs a majority baseline and a random
    forest, printing the confusion matrix and accuracy for each.
    """
    print "loading"
    annotations = annotation_reader.from_file("%s/data/directions/breadbox/nouns_stefie10.txt" % TKLIB_HOME)
    table = annotations.as_orange_table()
    # p0=0.5 gives a 50/50 random split of the annotation table.
    cv_indices = orange.MakeRandomIndices2(table, p0=0.5)
    print "indices", set(cv_indices)
    print "splitting"
    training, testing = annotation_reader.split(annotations, cv_indices)
    print "features"

    engine = PairwiseEngine(training)
    
    training_table = engine.training_table
    testing_table = engine.makeTable(testing)
    print len(training_table), "training"
    print len(testing_table), "testing"
    
    learners = [orange.MajorityLearner(),
                orngEnsemble.RandomForestLearner(),
                ]
    results = orngTest.learnAndTestOnTestData(learners, 
                                              training_table, testing_table)

    # Report per-learner confusion matrix and classification accuracy.
    for accuracy, cm in zip(orngStat.CA(results),
                            orngStat.confusionMatrices(results)):
        print orangeUtils.confusion_matrix_to_string(table.domain, cm)
        print "accuracy: %.2f%%" % (accuracy*100)
Пример #6
0
def proportionTest(learners,
                   examples,
                   learnProp,
                   times=10,
                   strat=orange.MakeRandomIndices.StratifiedIfPossible,
                   pps=[],
                   callback=None,
                   **argkw):
    """Train-and-test evaluation (train on a subset, test on remaining examples).

    Repeats `times` random splits with proportion `learnProp` in the training
    set; `callback`, if given, is invoked after every repetition.
    Returns a single ExperimentResults accumulating all repetitions.
    """
    # randomGenerator is set either to what users provided or to orange.RandomGenerator(0)
    # If we left it None or if we set MakeRandomIndices2.randseed, it would give same indices each time it's called
    randomGenerator = argkw.get("indicesrandseed", 0) or argkw.get(
        "randseed", 0) or argkw.get("randomGenerator", 0)
    pick = orange.MakeRandomIndices2(stratified=strat,
                                     p0=learnProp,
                                     randomGenerator=randomGenerator)

    examples, weight = demangleExamples(examples)
    classVar = examples.domain.classVar
    # Class values/base value only exist for discrete class variables.
    if classVar.varType == orange.VarTypes.Discrete:
        values = list(classVar.values)
        baseValue = classVar.baseValue
    else:
        baseValue = values = None
    testResults = ExperimentResults(times, [l.name for l in learners], values,
                                    weight != 0, baseValue)

    for time in range(times):
        indices = pick(examples)
        # Group 0 is the training set, group 1 the test set.
        learnset = examples.selectref(indices, 0)
        testset = examples.selectref(indices, 1)
        learnAndTestOnTestData(learners, (learnset, weight), (testset, weight),
                               testResults, time, pps, **argkw)
        if callback: callback()
    return testResults
Пример #7
0
def learningCurveN(learners,
                   examples,
                   folds=10,
                   strat=orange.MakeRandomIndices.StratifiedIfPossible,
                   proportions=orange.frange(0.1),
                   pps=[],
                   **argkw):
    """Construct a learning curve for learners.

    Builds cross-validation and subsampling index generators (stratified when
    `strat` is set) and delegates to learningCurve().
    """
    # NOTE(review): the default of -1 is truthy, so the explicit
    # "randomGenerator" kwarg branch below is only reached when a caller
    # passes indicesrandseed/randseed equal to 0 — confirm this is intended.
    seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
    if seed:
        randomGenerator = orange.RandomGenerator(seed)
    else:
        randomGenerator = argkw.get("randomGenerator",
                                    orange.RandomGenerator())

    if strat:
        cv = orange.MakeRandomIndicesCV(folds=folds,
                                        stratified=strat,
                                        randomGenerator=randomGenerator)
        pick = orange.MakeRandomIndices2(stratified=strat,
                                         randomGenerator=randomGenerator)
    else:
        # Non-stratified path uses the plain RandomIndices variants.
        cv = orange.RandomIndicesCV(folds=folds,
                                    stratified=strat,
                                    randomGenerator=randomGenerator)
        pick = orange.RandomIndices2(stratified=strat,
                                     randomGenerator=randomGenerator)
    return learningCurve(*(learners, examples, cv, pick, proportions, pps),
                         **argkw)
Пример #8
0
def test_rnd_sampling(data, learners, p=0.9, n=30):
    acc = [0.0] * len(learners)
    for i in range(n):
        newselection = orange.MakeRandomIndices2(data, 0.1, randseed=i + 10)
        selection = orange.MakeRandomIndices2(data, p, randseed=i)
        train_data = data.select(selection, 0)  #this selects selection
        test_data = data.select(newselection, 0)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        acc1 = accuracy(test_data, classifiers)
        print "%d: %s" % (i + 1, acc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / n
    return acc
Пример #9
0
def partition_data(data, percent_train=0.5):
    """Randomly partition `data` into a (train, test) pair of tables.

    `percent_train` is the proportion handed to MakeRandomIndices2 as p0;
    the test set is simply the complement of the training selection.
    """
    maker = orange.MakeRandomIndices2(p0=percent_train)
    indices = maker(data)
    return (data.select(indices), data.select(indices, negate=True))
Пример #10
0
def cforange_split_dataset(input_dict):
    """Split input_dict['dataset'] into train/test parts by proportion 'p'.

    Returns a dict with 'train_data' (selection group 0) and 'test_data'
    (selection group 1).
    """
    import orange
    data = input_dict['dataset']
    proportion = float(input_dict['p'])
    selection = orange.MakeRandomIndices2(data, proportion)
    return {
        'train_data': data.select(selection, 0),
        'test_data': data.select(selection, 1),
    }
Пример #11
0
 def FindSmilesAttr(self, data):
     """Return the string variable whose values most often load as molecules.

     Samples up to 20 examples and counts, for every string variable among
     the attributes and metas, how many of its values are accepted by
     LoadMolFromSmiles; the variable with the highest count is returned.
     """
     data=data.select(orange.MakeRandomIndices2(data, min(20, len(data))))
     stringVars=filter(lambda var:type(var)==orange.StringVariable, data.domain.attributes+data.domain.getmetas().values())
     count=dict.fromkeys(stringVars, 0)
     for example in data:
         for var in stringVars:
             if LoadMolFromSmiles(str(example[var])):
                 count[var]+=1
     # Sort (variable, count) pairs ascending by count; last entry wins.
     count=count.items()
     count.sort(lambda a,b:cmp(a[1], b[1]))
     return count[-1][0]
Пример #12
0
def test_rnd_sampling(data, learners, p=0.9, n=30):
    acc = [0.0] * len(learners)
    sens = [0.0] * len(learners)
    spec = [0.0] * len(learners)
    fpos = [0.0] * len(learners)
    fneg = [0.0] * len(learners)
    for i in range(n):
        newselection = orange.MakeRandomIndices2(data, 0.1, randseed=i + 10)
        selection = orange.MakeRandomIndices2(data, p, randseed=i)
        train_data = data.select(selection, 0)  #this selects selection
        test_data = data.select(newselection, 0)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        acc1 = accuracy(test_data, classifiers)[0]
        sens1 = accuracy(test_data, classifiers)[1]
        spec1 = accuracy(test_data, classifiers)[2]
        fpos1 = accuracy(test_data, classifiers)[3]
        fneg1 = accuracy(test_data, classifiers)[4]
        print "%d: %s" % (i + 1, acc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
        for j in range(len(learners)):
            sens[j] += sens1[j]
        for j in range(len(learners)):
            spec[j] += spec1[j]
        for j in range(len(learners)):
            fpos[j] += fpos1[j]
        for j in range(len(learners)):
            fneg[j] += fneg1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / n
    for j in range(len(learners)):
        sens[j] = sens[j] / n
    for j in range(len(learners)):
        spec[j] = spec[j] / n
    for j in range(len(learners)):
        fpos[j] = fpos[j] / n
    for j in range(len(learners)):
        fneg[j] = fneg[j] / n
    return (acc, sens, spec, fpos, fneg)
Пример #13
0
    def randSamp(self, inData, trainingFrac):
        """Use random sampling to partition inData into a training and a test set.

        The randseed is derived from len(inData), so the same data always
        produces the same partitioning (the random generator is explicitly
        disabled below). Returns a (train_data, test_data) tuple.
        """
        indices = orange.MakeRandomIndices2(p0=1-trainingFrac)#trainingFrac)
        # randomGenerator=None plus a data-derived randseed makes the split
        # deterministic for a given dataset size.
        indices.randomGenerator = None
        indices.randseed = len(inData)
        selection = indices(inData) 
        train_data = inData.select(selection, 0)
        test_data = inData.select(selection, 1)

        return train_data, test_data
Пример #14
0
    def test_MakeRandomIndices2(self):
        """Sanity checks for orange.MakeRandomIndices2.

        Verifies that p0 works both as an absolute count and as a
        proportion, that a constructed generator can be reused, and that
        stratification keeps class proportions on the iris data.
        """
        d = orange.ExampleTable("iris")

        # p0 as an absolute count of examples in group 0.
        inds = orange.MakeRandomIndices2(10, p0=5)
        self.assertEqual(sum(inds), 5)

        # p0 as a proportion.
        inds = orange.MakeRandomIndices2(10, p0=0.5)
        self.assertEqual(sum(inds), 5)

        # Degenerate proportions: everything in one group.
        inds = orange.MakeRandomIndices2(10, p0=0)
        self.assertEqual(sum(inds), 10)

        inds = orange.MakeRandomIndices2(10, p0=1)
        self.assertEqual(sum(inds), 0)

        # A pre-built generator can be called later with just a size.
        mr = orange.MakeRandomIndices2(p0=0.3)
        self.assertEqual(sum(mr(10)), 7)

        # Stratified split of iris: 45 of the 50 class-0 examples in group 0.
        mr.p0 = 0.9
        inds = mr(d)
        self.assertEqual(sum(inds), 15)
        self.assertEqual(
            len([
                i for i, fold in enumerate(inds)
                if fold == 0 and d[i].getclass() == 0
            ]), 45)

        mr.stratified = mr.Stratification.NotStratified
        inds = mr(d)
        self.assertEqual(sum(inds), 15)
        ## Probably not equal... ;)
        self.assertNotEqual(
            len([
                i for i, fold in enumerate(inds)
                if fold == 0 and d[i].getclass() == 0
            ]), 45)
Пример #15
0
def test_rnd_sampling(data, learners, p=0.7, n=10):
    acc = [0.0] * len(learners)
    for i in range(n):
        selection = orange.MakeRandomIndices2(data, p)
        train_data = data.select(selection, 0)
        test_data = data.select(selection, 1)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        acc1 = accuracy(test_data, classifiers)
        print "%d: %s" % (i + 1, acc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / n
    return acc
Пример #16
0
def smallRocCurve():

    trainer = Trainer()

    keys = None
    keys = None
    #keys = ["towards"]
    for i, key in enumerate(trainer.annotationEngines):
        if keys != None and not key in keys:
            continue

        print "*****************************************************"
        print key
        engine = trainer.engineMap[key]
        mpl.figure(figsize=(8, 8))
        print "training"
        table = trainer.makeTable(engine)
        cv_indices = orange.MakeRandomIndices2(table, p0=0.75)

        training = table.select(cv_indices, 0, negate=True)
        testing = table.select(cv_indices, 0, negate=False)

        classifier = orangePickle.PickleableClassifier(training,
                                                       orngBayes.BayesLearner)
        #orange.LogRegLearner)
        results = orngTest.testOnData([classifier], testing)

        displayResults(results)

        line = rocCurve(results,
                        "",
                        stepSize=0.001,
                        marker=".",
                        plotArgs=dict(linewidth=5))

        line[0].set_label(engine.name())
        mpl.xlabel("FP", fontsize=25)
        mpl.ylabel("TP", fontsize=25)
        mpl.xticks([0, 1], fontsize=20)
        mpl.yticks([0, 1], fontsize=20)
        ax = mpl.gca()
        ax.set_aspect(1. / ax.get_data_ratio())
        mpl.title(engine.name().capitalize(), fontsize=30)
        #mpl.legend(loc='lower right', prop=FontProperties(size=25))
        mpl.savefig("roc.%s.png" % engine.name())

    mpl.show()
Пример #17
0
def main():
    """Evaluate noun classifiers on the breadbox corpus with several learners.

    Loads two annotators' files, splits the first 50/50 into train/test,
    builds WordnetParentsEngine feature tables, and compares a majority
    baseline, a random forest, a Wordnet-kNN classifier and a wizard-of-oz
    learner built from the second annotator's table.
    """
    print "loading"
    annotations = annotation_reader.from_file(
        "%s/data/directions/breadbox/nouns_stefie10.txt" % TKLIB_HOME)
    annotator2 = annotation_reader.from_file(
        "%s/data/directions/breadbox/nouns_dlaude.partial.txt" % TKLIB_HOME)
    #histogram(annotations)
    print "table"
    table = annotations.as_orange_table()
    # p0=0.5 gives a 50/50 random split of the annotation table.
    cv_indices = orange.MakeRandomIndices2(table, p0=0.5)
    print "indices", set(cv_indices)
    print "splitting"
    training, testing = annotation_reader.split(annotations, cv_indices)
    print "features"

    engine = WordnetParentsEngine(training)
    training_table = engine.makeTable(training)
    testing_table = engine.makeTable(testing)

    #training_table, testing_table = wordnet_parents(training, testing)
    #training_table, testing_table = wordnet_glosses(training, testing)
    #training_table, testing_table = flickr_parents(training, testing)

    print len(training_table), "training examples"
    print len(testing_table), "testing examples"

    #training_table = annotation_reader.to_big_small(training_table)
    #testing_table = annotation_reader.to_big_small(testing_table)

    #information_gain = orange.MeasureAttribute_info()
    #for x in training_table.domain.attributes:
    #    print "x", information_gain(x, training_table)

    learners = [
        orange.MajorityLearner(),
        orngEnsemble.RandomForestLearner(), WordnetKnnClassifier,
        agreement.WizardOfOzLearner(annotator2.as_orange_table())
    ]
    results = orngTest.learnAndTestOnTestData(learners, training_table,
                                              testing_table)
    # Report per-learner confusion matrix and classification accuracy.
    for accuracy, cm in zip(orngStat.CA(results),
                            orngStat.confusionMatrices(results)):
        print orangeUtils.confusion_matrix_to_string(table.domain, cm)
        print "accuracy: %.2f%%" % (accuracy * 100)
Пример #18
0
def learningCurve(learners,
                  examples,
                  cv=None,
                  pick=None,
                  proportions=orange.frange(0.1),
                  pps=[],
                  **argkw):
    """Compute a learning curve by cross-validation.

    For each proportion p, runs cross-validation where each fold's training
    set is subsampled to proportion p (via `pick`) before learning. Supports
    optional on-disk caching of results keyed by seeds and data checksum.

    Returns a list of ExperimentResults, one per proportion.
    """
    verb = argkw.get("verbose", 0)
    cache = argkw.get("cache", 0)
    callback = argkw.get("callback", 0)

    # Only learning-set preprocessors ("L" prefix) are allowed here.
    for pp in pps:
        if pp[0] != "L":
            raise SystemError("cannot preprocess testing examples")

    if not cv or not pick:
        # NOTE(review): the default of -1 is truthy, so the explicit
        # "randomGenerator" branch is only reached when a caller passes a
        # seed of 0 — confirm this is intended.
        seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
        if seed:
            randomGenerator = orange.RandomGenerator(seed)
        else:
            randomGenerator = argkw.get("randomGenerator",
                                        orange.RandomGenerator())
        if not cv:
            cv = orange.MakeRandomIndicesCV(
                folds=10,
                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=randomGenerator)
        if not pick:
            pick = orange.MakeRandomIndices2(
                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=randomGenerator)

    examples, weight = demangleExamples(examples)
    folds = cv(examples)
    ccsum = hex(examples.checksum())[2:]
    ppsp = encodePP(pps)
    nLrn = len(learners)

    allResults = []
    for p in proportions:
        printVerbose("Proportion: %5.3f" % p, verb)

        # Caching requires deterministic (non-negative) seeds and a
        # cacheable preprocessor signature (no "*" in the file name).
        if (cv.randseed < 0) or (pick.randseed < 0):
            cache = 0
        else:
            fnstr = "{learningCurve}_%s_%s_%s_%s%s-%s" % (
                "%s", p, cv.randseed, pick.randseed, ppsp, ccsum)
            if "*" in fnstr:
                cache = 0

        # Convert classes with int (discrete) or float (continuous).
        conv = examples.domain.classVar.varType == orange.VarTypes.Discrete and int or float
        testResults = ExperimentResults(
            cv.folds, [l.name for l in learners],
            examples.domain.classVar.values.native(), weight != 0,
            examples.domain.classVar.baseValue)
        testResults.results = [
            TestedExample(folds[i], conv(examples[i].getclass()), nLrn,
                          examples[i].getweight(weight))
            for i in range(len(examples))
        ]

        if cache and testResults.loadFromFiles(learners, fnstr):
            printVerbose("  loaded from cache", verb)
        else:
            for fold in range(cv.folds):
                printVerbose("  fold %d" % fold, verb)

                # learning: everything outside this fold, subsampled to p
                learnset = examples.selectref(folds, fold, negate=1)
                learnset = learnset.selectref(pick(learnset, p0=p), 0)
                if not len(learnset):
                    continue

                for pp in pps:
                    learnset = pp[1](learnset)

                classifiers = [None] * nLrn
                for i in range(nLrn):
                    if not cache or not testResults.loaded[i]:
                        classifiers[i] = learners[i](learnset, weight)

                # testing: examples belonging to this fold
                for i in range(len(examples)):
                    if (folds[i] == fold):
                        # This is to prevent cheating:
                        ex = orange.Example(examples[i])
                        ex.setclass("?")
                        for cl in range(nLrn):
                            if not cache or not testResults.loaded[cl]:
                                cls, pro = classifiers[cl](ex, orange.GetBoth)
                                testResults.results[i].setResult(cl, cls, pro)
                if callback: callback()
            if cache:
                testResults.saveToFiles(learners, fnstr)

        allResults.append(testResults)

    return allResults
Пример #19
0
    def __call__(self, data, weight=None):
        bestSeed = None
        bestAcc = None
        bestNiter = None
        bestModel = None
        #fix self.nDiffIniWeights for the disabled mode
        if self.nDiffIniWeights <= 1:
            self.nDiffIniWeights = 1  #loop over n different initial weights Disabled
        #Fix self.stopUPs for the disabled mode
        if self.stopUPs <= 0:
            self.stopUPs = 0  # Optimization of nIter will be disabled

        self.NTrainEx = len(data)
        #Remove from the domain any unused values of discrete attributes including class
        data = dataUtilities.getDataWithoutUnusedValues(data, True)

        #dataUtilities.rmAllMeta(data)
        if len(data.domain.getmetas()) == 0:
            cleanedData = data
        else:
            cleanedData = dataUtilities.getCopyWithoutMeta(data)
        # Create the imputer
        self.imputer = orange.ImputerConstructor_average(cleanedData)
        # Impute the data
        self.trainData = self.imputer(cleanedData)
        # If we are not seetin neither weights init optimization or nEphocs optimization (opencvLayer), the do nto split the data
        if self.stopUPs != 0 or self.nDiffIniWeights > 1:
            #Define train-80% and validation set-20% of the input data
            indices = orange.MakeRandomIndices2(
                p0=0.2,
                stratified=orange.MakeRandomIndices.StratifiedIfPossible)
            ind = indices(cleanedData)
            self.trainData = cleanedData.select(ind, 1)
            validationSet = cleanedData.select(ind, 0)
        else:
            validationSet = None

        if self.verbose and self.nDiffIniWeights > 1:
            print "=========== Training ", self.nDiffIniWeights, " times with different initial weights =============="
        for n in range(self.nDiffIniWeights):
            if self.nDiffIniWeights <= 1:
                seed = 0  #in opencv  mmlann seed=0 means the seed is disabled, and original seed will be used
            else:
                seed = len(cleanedData) * len(cleanedData.domain) * (
                    n + 1)  #seed can be any integer
            #Create a model with a specific seed for training opencv ANN.
            #Also passing the step for the nIter optimization (self.stopUPs=0 - disable nIter optimization)
            #Also passing the validation set to be used in internal opencv implemented nEphocs optimization.
            model = self.__train__(weight=None,
                                   seed=seed,
                                   validationSet=validationSet)
            #Skip evaluation if the weights loop is disabled
            if self.nDiffIniWeights <= 1:
                return model
                break
            if cleanedData.domain.classVar.varType == orange.VarTypes.Discrete:
                Acc = evalUtilities.getClassificationAccuracy(
                    validationSet, model)
            else:
                Acc = -evalUtilities.getRMSE(validationSet, model)
            if bestModel == None or (Acc > bestAcc) or (
                    Acc == bestAcc and model.nIter < bestNiter):
                bestSeed = seed
                bestAcc = Acc
                bestNiter = model.nIter
                bestModel = model
            if self.verbose:
                print "nIter:%-7s  Acc:%-20s  seed: %s" % (model.nIter, Acc,
                                                           seed)

        if self.verbose:
            print "================ Best model Found: ==================="
        if self.verbose:
            print "nIter:%-7s  Acc:%-20s  seed: %s" % (bestNiter, bestAcc,
                                                       bestSeed)

        # DEBUG for check if the returned model is indeed the best model, and not the last trainted
        #if cleanedData.domain.classVar.varType == orange.VarTypes.Discrete:
        #    Acc = evalUtilities.getClassificationAccuracy(validationSet, bestModel)
        #else:
        #    Acc = -evalUtilities.getRMSE(validationSet, bestModel)
        #if self.verbose: print "================ Best model returned: ==================="
        #if self.verbose:  print "nIter:%-7s  Acc:%-20s  seed: %s" % (bestModel.nIter,Acc,bestModel.seed)

        return bestModel
Пример #20
0
import orange, orngWrap, orngTest, orngStat

# Demo: effect of different decision thresholds on a naive Bayes classifier.
data = orange.ExampleTable("bupa")
# 70/30 random train/test split (group 0 = train, group 1 = test).
ri2 = orange.MakeRandomIndices2(data, 0.7)
train = data.select(ri2, 0)
test = data.select(ri2, 1)

bayes = orange.BayesLearner(train)

# Wrap the same Bayes model with three different decision thresholds.
thresholds = [.2, .5, .8]
models = [orngWrap.ThresholdClassifier(bayes, thr) for thr in thresholds]

res = orngTest.testOnData(models, test)
cm = orngStat.confusionMatrices(res)

# Report true-positive/true-negative counts per threshold.
print
for i, thr in enumerate(thresholds):
    print "%1.2f: TP %5.3f, TN %5.3f" % (thr, cm[i].TP, cm[i].TN)
Пример #21
0
##res = orngTest.proportionTest(learners, data, 0.7, 100, pps = [("L", classnoise)])
##printResults(res)

# NOTE: `learners`, `data` and `printResults` are defined earlier in the
# original script, outside this excerpt.
print "\nGood old 10-fold cross validation"
res = orngTest.crossValidation(learners, data)
printResults(res)

# Learning curve over training proportions 0.2, 0.4, 0.6, 0.8 with 5 folds.
print "\nLearning curve"
prop = orange.frange(0.2, 1.0, 0.2)
res = orngTest.learningCurveN(learners, data, folds=5, proportions=prop)
for i in range(len(prop)):
    print "%5.3f:" % prop[i],
    printResults(res[i])

# Same curve, but with a fixed, pre-separated 70/30 train/test split.
print "\nLearning curve with pre-separated data"
indices = orange.MakeRandomIndices2(data, p0=0.7)
train = data.select(indices, 0)
test = data.select(indices, 1)
res = orngTest.learningCurveWithTestData(learners,
                                         train,
                                         test,
                                         times=5,
                                         proportions=prop)
for i in range(len(prop)):
    print "%5.3f:" % prop[i],
    printResults(res[i])

print "\nLearning and testing on pre-separated data"
res = orngTest.learnAndTestOnTestData(learners, train, test)
printResults(res)
Пример #22
0
def ablateFeaturesForCls(engineCls):
    """Feature-ablation study: plot one ROC curve per single-feature engine.

    For every feature in engineCls's master list, trains a Bayes classifier
    using only that feature (plus one run with all features, labelled "all"),
    plots each ROC curve on a shared figure and saves it as
    roc.ablate.<engine>.png.
    """
    mpl.figure()
    trainer = Trainer()
    engine = engineCls()
    trainer.configureClassifier(engine)
    # Distinct marker/color combinations for the per-feature curves.
    markers = [
        '.',
        ',',
        'v',
        '^',
        '<',
        '>',
        '1',
        '2',
        '3',
        '4',
        's',
        'p',
        '*',
        'h',
        'H',
    ]
    colors = ["b", "g", "r", "c", "m", "y"]

    # One sub-engine per feature, each restricted to that single feature.
    sub_engines = []
    for i, name in enumerate(sorted(engine.masterList)):
        sub_engine = engineCls()
        sub_engine.setFeatureList([name])
        sub_engines.append((name, sub_engine))

    # The all-features engine gets a reserved marker/color ("o", black).
    markers = markers[0:len(sub_engines)]
    colors = colors[0:len(sub_engines)]
    sub_engines.append(("all", engineCls()))
    markers.append("o")
    colors.append("k")

    for i, (name, sub_engine) in enumerate(sub_engines):
        table = trainer.configureClassifier(sub_engine)
        cv_indices = orange.MakeRandomIndices2(table, p0=0.75)

        # Training takes examples outside group 0, testing takes group 0.
        training = table.select(cv_indices, 0, negate=True)
        testing = table.select(cv_indices, 0, negate=False)

        #classifier = orange.LogRegLearner(training)
        classifier = orngBayes.BayesLearner(training)
        results = orngTest.testOnData([classifier], testing)
        displayResults(results)

        line = rocCurve(
            results,
            "",
            stepSize=0.001,
            marker=markers[i % len(markers)],
            plotArgs=dict(linewidth=5,
                          markersize=10,
                          color=colors[i % len(colors)]),
        )

        line[0].set_label(name)

    mpl.title(engine.name(), size=30)
    mpl.xlabel("FP", fontsize=30)
    mpl.ylabel("TP", fontsize=30)
    mpl.xticks([0, 1], fontsize=17)
    mpl.yticks([0, 1], fontsize=17)
    mpl.subplots_adjust(bottom=0.14, top=0.91)
    mpl.legend(loc="lower right", prop=dict(size=17))
    mpl.savefig("roc.ablate.%s.png" % engine.name())
        
        self.changedFlag = False

##############################################################################
# Test the widget, run from DOS prompt

if __name__=="__main__":
    # Manual smoke test for the OWPredictions widget: build three classifiers
    # on a 50/50 split of iris and feed them to the widget by hand.
    a = QApplication(sys.argv)
    ow = OWPredictions()
    ow.show()

    import orngTree

    dataset = orange.ExampleTable('../../doc/datasets/iris.tab')
#    dataset = orange.ExampleTable('../../doc/datasets/auto-mpg.tab')
    ind = orange.MakeRandomIndices2(p0=0.5)(dataset)
    data = dataset.select(ind, 0)
    test = dataset.select(ind, 1)
    # Copy of the test set with the class variable stripped from the domain.
    testnoclass = orange.ExampleTable(orange.Domain(test.domain.attributes, False), test)        
    tree = orngTree.TreeLearner(data)
    tree.name = "tree"
    maj = orange.MajorityLearner(data)
    maj.name = "maj"
    knn = orange.kNNLearner(data, k = 10)
    knn.name = "knn"
    
#    ow.setData(test)
#    
#    ow.setPredictor(maj, 1)
    
    
Пример #24
0
import orange
import orngClustering

# Demo: hierarchical clustering of a random 20-example subsample of iris.
data = orange.ExampleTable("iris")
sample = data.selectref(orange.MakeRandomIndices2(data, 20), 0)
root = orngClustering.hierarchicalClustering(sample)
# Render the dendrogram, labelling each leaf with the example's class.
orngClustering.dendrogram_draw("hclust-dendrogram.png", root, data=sample, labels=[str(d.getclass()) for d in sample]) 

Пример #25
0
# Description: Shows how to sample examples by random division into two groups
# Category:    sampling
# Classes:     MakeRandomIndices, MakeRandomIndices2, RandomGenerator
# Uses:        lenses
# Referenced:  RandomIndices.htm

import orange

data = orange.ExampleTable("lenses")

# p0=6: an integer p0 presumably requests 6 examples in group 0 (a float,
# as elsewhere in this file, would be a proportion) — see RandomIndices.htm.
indices2 = orange.MakeRandomIndices2(p0=6)

ind = indices2(data)
print ind
data0 = data.select(ind, 0)
data1 = data.select(ind, 1)
print len(data0), len(data1)

# With neither a random generator nor a seed set, each call presumably
# produces a different split.
print "\nIndices without playing with random generator"
for i in range(5):
    print indices2(data)

# With an explicit RandomGenerator the stream of splits becomes reproducible
# (the generator's state advances across calls).
print "\nIndices with random generator"
indices2.randomGenerator = orange.RandomGenerator(42)
for i in range(5):
    print indices2(data)

# With randseed set (and no generator), the seed is presumably reapplied on
# every call, so repeated calls should yield identical splits — verify
# against RandomIndices.htm.
print "\nIndices with randseed"
indices2.randomGenerator = None
indices2.randseed = 42
for i in range(5):
Пример #26
0
def nway():
    """Train and evaluate a single n-way classifier over all annotation
    engines (plus a catch-all "null" class for far-away examples).

    Builds a joint Orange domain from every engine's features, trains a
    pickleable naive Bayes classifier on a random split, prints accuracy
    and the confusion matrix, draws one ROC figure per class, and pickles
    the classifier to nway.pck.
    """
    # class name -> list of per-example lists of engine feature examples
    engine_to_examples = {}

    trainer = Trainer()
    classes = set()
    for i, key in enumerate(trainer.annotationEngines):
        engine = trainer.engineMap[key]
        table = trainer.makeTable(engine)

        for ex in table:
            # Far-away examples are lumped into the "null" class.
            if ex["farAway"].value:
                cls = "null"
            else:
                cls = ex["sourceEngineName"].value
            geometry = ex["geometry"].value
            engine_to_examples.setdefault(cls, [])

            classes.add(cls)

            # One feature example per engine for this geometry.
            # NOTE(review): the comprehension's 'key' shadows the outer loop
            # variable (Python 2 leaks it), and the 'figure' test is
            # loop-invariant — it keeps either all engines or none.
            examples = [
                trainer.engineMap[key].makeExample(expectInsane=True,
                                                   **geometry)
                for key in trainer.annotationEngines
                if not len(geometry["figure"]) == 0
            ]

            engine_to_examples[cls].append(examples)

        # Dead code: the early-exit break is disabled.
        if i >= 1:
            #break
            pass
    # Build the joint attribute list from the last 'examples' value leaked
    # out of the loop above — presumably every example list exposes the same
    # attribute names; verify against makeExample.
    variables = []
    for ex in examples:
        for attr in ex.domain:
            if attr.name == "class":
                continue
            new_attr = orange.FloatVariable(attr.name)
            variables.append(new_attr)
    domain = orange.Domain(variables,
                           orange.EnumVariable("class", values=list(classes)))
    table = orange.ExampleTable(domain)
    # Flatten each list of per-engine examples into one joint example.
    for engine_name, example_lists in engine_to_examples.iteritems():
        for example_list in example_lists:
            ex = orange.Example(domain)
            for engine_ex in example_list:
                for attr in engine_ex.domain:
                    ex[attr.name] = engine_ex[attr.name]
            ex["class"] = engine_name
            table.append(ex)
    print "domain", domain

    # NOTE(review): p0=0.75 puts 75% of examples in partition 0, but
    # training selects with negate=True (the 25% complement) and testing
    # gets the 75% — confirm this inversion is intentional.
    cv_indices = orange.MakeRandomIndices2(table, p0=0.75)

    training = table.select(cv_indices, 0, negate=True)
    testing = table.select(cv_indices, 0, negate=False)
    #classifier = orngBayes.BayesLearner(training)

    classifier = orangePickle.PickleableClassifier(training,
                                                   orngBayes.BayesLearner)

    results = orngTest.testOnData([classifier], testing)
    print orngStat.CA(results)
    cm = orngStat.confusionMatrices(results)[0]
    classes = list(domain.classVar.values)
    # Confusion matrix with right-justified 12-char columns.
    print "           ", " ".join([c.rjust(12) for c in classes + ["", ""]])
    for className, classConfusions in zip(classes, cm):
        #format = ("%s" + ("\t%i" * len(classes)))
        values = (className, ) + tuple(classConfusions)
        print " ".join([str(c).rjust(12) for c in values])
        #print  format % values

    # One ROC figure per class (shown together by mpl.show() below).
    for name in classes:
        classIndex = classes.index(name)
        mpl.figure()
        rocCurve(results,
                 "",
                 classIndex,
                 stepSize=0.001,
                 plotArgs=dict(linewidth=5, markersize=10))
        mpl.title(name, size=30)
        mpl.xlabel("FP", fontsize=30)
        mpl.ylabel("TP", fontsize=30)
        mpl.xticks([0, 1], fontsize=17)
        mpl.yticks([0, 1], fontsize=17)
    fname = "nway.pck"
    print "saving", fname
    with open(fname, "w") as f:
        pickle.dump(classifier, f, protocol=2)
    mpl.show()
Пример #27
0
# Description: Shows how to use the nearest-neighbour learning
# Category:    learning
# Classes:     kNNLearner, kNNClassifier, ExamplesDistance, ExamplesDistanceConstructor
# Uses:        iris
# Referenced:  kNNLearner.htm

import orange, orngTest, orngStat
data = orange.ExampleTable("iris")

# 80/20 train/test split of iris.
rndind = orange.MakeRandomIndices2(data, p0=0.8)
train = data.select(rndind, 0)
test = data.select(rndind, 1)

# k-NN with default distance: print true vs. predicted class for five
# random test examples.
knn = orange.kNNLearner(train, k=10)
for i in range(5):
    example = test.randomexample()
    print example.getclass(), knn(example)

print "\n\n"
# Same experiment, but configure the learner first and use Hamming distance.
# NOTE(review): 'data' is reloaded but never used again — the learner is
# still trained on the earlier 'train' split, so the reload has no effect.
data = orange.ExampleTable("iris")
knn = orange.kNNLearner()
knn.k = 10
knn.distanceConstructor = orange.ExamplesDistanceConstructor_Hamming()
knn = knn(train)
for i in range(5):
    example = test.randomexample()
    print example.getclass(), knn(example)
Пример #28
0
# NOTE(review): this fragment starts mid-example — 'vehicle', 'resVeh',
# 'learners' and 'voting' are defined in an earlier part of the example
# that is not visible here.
print
classes = vehicle.domain.classVar.values
# Pairwise AUC matrix for the multi-class 'vehicle' results (lower triangle).
AUCmatrix = orngStat.AUC_matrix(resVeh)[0]
print "\t"+"\t".join(classes[:-1])
for className, AUCrow in zip(classes[1:], AUCmatrix[1:]):
    print ("%s" + ("\t%5.3f" * len(AUCrow))) % ((className, ) + tuple(AUCrow))

print
print "AUCs for detecting various pairs of classes in 'vehicle'"
# AUC_pair returns one value per learner; three learners are assumed by the
# three %5.3f slots — confirm against the earlier 'learners' definition.
for c1, s1 in enumerate(classes):
    for c2 in range(c1):
        print "%s vs %s: \t%5.3f\t%5.3f\t%5.3f" % ((s1, classes[c2]) + tuple(orngStat.AUC_pair(resVeh, c1, c2)))


# 60/40 train/test split of the two-class 'voting' data.
ri2 = orange.MakeRandomIndices2(voting, 0.6)
train = voting.selectref(ri2, 0)
test = voting.selectref(ri2, 1)
res1 = orngTest.learnAndTestOnTestData(learners, train, test)

print
print "AUC and SE for voting"
# AUCWilcoxon yields (AUC, standard error) per learner.
AUCs = orngStat.AUCWilcoxon(res1)
for li, lrn in enumerate(learners):
    print "%s: %5.3f+-%5.3f" % (lrn.name, AUCs[li][0], AUCs[li][1])

print
print "Difference between naive Bayes and tree: %5.3f+-%5.3f" % tuple(orngStat.compare2AUCs(res1, 0, 1)[2])

print
print "ROC (first 20 points) for bayes on 'voting'"
Пример #29
0
# Description: Builds regression models from data and outputs predictions for first five instances
# Category:    modelling
# Uses:        housing
# Classes:     MakeRandomIndices2, MajorityLearner, orngTree.TreeLearner, orange.kNNLearner
# Referenced:  regression.htm

import orange, orngTree, orngTest, orngStat

# 50/50 train/test split of the housing data.
data = orange.ExampleTable("housing.tab")
selection = orange.MakeRandomIndices2(data, 0.5)
train_data = data.select(selection, 0)
test_data = data.select(selection, 1)

# Baseline regressor: always predicts the default (majority/mean) value.
maj = orange.MajorityLearner(train_data)
maj.name = "default"

# Regression tree ("retis" measure) with m-pruning and a minimum leaf size.
rt = orngTree.TreeLearner(train_data, measure="retis", mForPruning=2, minExamples=20)
rt.name = "reg. tree"

k = 5
knn = orange.kNNLearner(train_data, k=k)
knn.name = "k-NN (k=%i)" % k

regressors = [maj, rt, knn]

# Header row of the predictions table; trailing commas suppress newlines
# (Python 2 print).
print "\n%10s " % "original",
for r in regressors:
  print "%10s " % r.name,
print
for i in range(10):
Пример #30
0
    def findProjection(self,
                       method,
                       attrIndices=None,
                       setAnchors=0,
                       percentDataUsed=100):
        """Compute 2D anchor positions for a linear projection of the data.

        method: DR_PCA, DR_SPCA or DR_PLS (the class-based methods are
            skipped when the data has no class).
        attrIndices: attribute indices to project; defaults to the
            currently shown attributes.
        setAnchors: if true, push the computed anchors into the graph and
            repaint it.
        percentDataUsed: when != 100, randomly subsample the examples to
            roughly this percentage before projecting.

        Returns (xAnchors, yAnchors, (attributes, attrIndices)), or None
        when no valid projection can be computed.
        """
        if not self.graph.haveData:
            return
        ai = self.graph.attributeNameIndex
        # Fix: use identity, not equality, to test for None ('x == None' on
        # an array-like would evaluate elementwise).
        if attrIndices is None:
            attributes = self.getShownAttributeList()
            attrIndices = [ai[label] for label in attributes]
        if len(attrIndices) == 0:
            return None

        # Keep only examples that are valid in every chosen attribute.
        validData = self.graph.getValidList(attrIndices)
        if sum(validData) == 0:
            return None

        # Rows = chosen attributes, columns = valid examples.
        dataMatrix = numpy.compress(validData,
                                    numpy.take(
                                        self.graph.noJitteringScaledData,
                                        attrIndices,
                                        axis=0),
                                    axis=1)
        if self.graph.dataHasClass:
            classArray = numpy.compress(
                validData,
                self.graph.noJitteringScaledData[self.graph.dataClassIndex])

        if percentDataUsed != 100:
            # Best-effort subsampling: keep the full matrix if compress fails.
            indices = orange.MakeRandomIndices2(
                self.graph.rawData, 1.0 - (float(percentDataUsed) / 100.0))
            try:
                dataMatrix = numpy.compress(indices, dataMatrix, axis=1)
            except Exception:  # narrowed from a bare 'except:'
                pass
            if self.graph.dataHasClass:
                classArray = numpy.compress(indices, classArray)

        vectors = None
        if method == DR_PCA:
            vals, vectors = createPCAProjection(
                dataMatrix,
                NComps=2,
                useGeneralizedEigenvectors=self.useGeneralizedEigenvectors)
        elif method == DR_SPCA and self.graph.dataHasClass:
            vals, vectors = createPCAProjection(
                dataMatrix,
                classArray,
                NComps=2,
                useGeneralizedEigenvectors=self.useGeneralizedEigenvectors)
        elif method == DR_PLS and self.graph.dataHasClass:
            dataMatrix = dataMatrix.transpose()
            classMatrix = numpy.transpose(numpy.matrix(classArray))
            vectors = createPLSProjection(dataMatrix, classMatrix, 2)
            vectors = vectors.T

        # Reject failed projections: nothing computed, an all-zero result,
        # or non-finite / complex entries.  ('vectors is None' and explicit
        # .all() replace the original '== None' / 'False in ...' forms,
        # which are unreliable on numpy arrays.)
        if (vectors is None or not vectors.any()
                or not numpy.isfinite(vectors).all()
                or not numpy.isreal(vectors).all()):
            self.setStatusBarText(
                "Unable to compute anchor positions for the selected attributes"
            )
            return None

        xAnchors = vectors[0]
        yAnchors = vectors[1]

        # Scale anchors so the farthest one lies on the unit circle.
        # m > 0 here: an all-zero 'vectors' was rejected above.
        m = math.sqrt(max(xAnchors**2 + yAnchors**2))

        xAnchors /= m
        yAnchors /= m
        names = self.graph.attributeNames
        attributes = [names[i] for i in attrIndices]

        if setAnchors:
            self.graph.setAnchors(list(xAnchors), list(yAnchors), attributes)
            self.graph.updateData()
            self.graph.repaint()
        return xAnchors, yAnchors, (attributes, attrIndices)