def learningCurveN(learners, examples, folds=10,
                   strat=orange.MakeRandomIndices.StratifiedIfPossible,
                   proportions=orange.frange(0.1), pps=[], **argkw):
    """Construct a learning curve for learners."""
    seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
    if seed:
        randomGenerator = orange.RandomGenerator(seed)
    else:
        randomGenerator = argkw.get("randomGenerator", orange.RandomGenerator())

    if strat:
        cv = orange.MakeRandomIndicesCV(folds=folds, stratified=strat,
                                        randomGenerator=randomGenerator)
        pick = orange.MakeRandomIndices2(stratified=strat,
                                         randomGenerator=randomGenerator)
    else:
        cv = orange.RandomIndicesCV(folds=folds, stratified=strat,
                                    randomGenerator=randomGenerator)
        pick = orange.RandomIndices2(stratified=strat,
                                     randomGenerator=randomGenerator)
    return learningCurve(*(learners, examples, cv, pick, proportions, pps), **argkw)
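# A minimal usage sketch for learningCurveN above (a hedged illustration, not
# part of the original module): assumes an Orange 2.x install where the
# bundled "voting" dataset loads and orngTree is importable.
import orange, orngTree
data = orange.ExampleTable("voting")
learners = [orange.BayesLearner(name="bayes"), orngTree.TreeLearner(name="tree")]
results = learningCurveN(learners, data, folds=5, proportions=orange.frange(0.2))
print len(results)  # one ExperimentResults per proportion: 0.2, 0.4, ..., 1.0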
def cf(input_dict):
    # tempfile = open("tempds.tab", 'w')
    # tempfile.write(input_dict['data'])
    # tempfile.close()
    # data = orange.ExampleTable("tempds.tab")
    data = orange.ExampleTable(input_dict['data'])
    addMetaID(data)
    k = 10
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    count_noisy = [0] * k
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        # print "\t\t", "Learned on", len(train_data), "examples"
        # file.flush()
        classifier = input_dict['learner'](train_data)
        for example in test_data:
            if classifier(example) != example.getclass():
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(int(example["meta_id"].value))
                count_noisy[test_fold] += 1
        # END test_data
    # END test_fold
    return noisyIndices
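# Hedged illustration of calling cf above: input_dict carries a dataset path
# and an Orange learner; assumes addMetaID (defined elsewhere in this module)
# attaches the "meta_id" meta attribute the loop reads back.
import orange
noisy = cf({'data': 'iris.tab', 'learner': orange.BayesLearner()})
print len(noisy), "examples misclassified by their out-of-fold model"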
def test_sample(self):
    d = orange.ExampleTable("iris")

    li = [1] * 10 + [0] * 140
    d1 = d.sample(li)
    self.assertEqual(len(d1), 10)
    for i in range(10):
        self.assertEqual(d1[i], d[i])
        self.assertEqual(d1[i].id, d[i].id)
    d[0, 0] = 42
    self.assertEqual(d1[0, 0], 42)

    d1 = d.sample(li, copy=True)
    self.assertEqual(len(d1), 10)
    self.assertEqual(d1[0], d[0])
    self.assertNotEqual(d1[0].id, d[0].id)
    d[0, 0] = 41
    self.assertEqual(d1[0, 0], 42)

    li = [1, 2, 3, 4, 5] * 30
    d1 = d.sample(li, 2)
    self.assertEqual(len(d1), 30)
    for i in range(30):
        self.assertEqual(d1[i].id, d[1 + 5 * i].id)

    ri = orange.MakeRandomIndicesCV(d)
    for fold in range(10):
        d1 = d.sample(ri, fold)
        self.assertEqual(orange.get_class_distribution(d1), [5, 5, 5])
def CVByPairs(data, dimensions=None, method=None, **dic):
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)
    meter = orange.ExamplesDistanceConstructor_Euclidean(data)

    maxDist = 0
    for i in range(100):
        maxDist = max(maxDist, meter(data.randomexample(), data.randomexample()))
    weightK = 10.0 / maxDist

    acc = amb = unre = 0
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train, dimensions, method, originalAsMeta=True, **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)
        tacc, tamb, tunre = computeDirectionAccuracyForPairs(tree, data, meter, weightK, -1)
        acc += tacc
        amb += tamb
        unre += tunre
    return acc / 10, amb / 10, unre / 10
def cfweka(input_dict, widget, name):
    from services.webservice import WebService
    wseval = WebService('http://vihar.ijs.si:8092/Evaluation?wsdl',
                        float(input_dict['timeout']))
    wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl',
                        float(input_dict['timeout']))
    somelearner = input_dict['learner']
    print somelearner
    data = input_dict['data']
    # arffstr = toARFFstring(data).getvalue()
    # #print arffstr
    # wekaInstances = wsutil.client.arff_to_weka_instances(arff=arffstr, class_index=odt.domain.index(odt.domain.classVar))
    # #print wekaInstances
    # model = wseval.client.build_classifier(learner=somelearner, instances=wekaInstances['instances'])
    # #return {}
    # addMetaID(data)
    k = int(input_dict['k_folds'])
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    count_noisy = [0] * k
    for test_fold in range(k):
        train_arffstr = toARFFstring(
            data.select(selection, test_fold, negate=1)).getvalue()
        train_data = wsutil.client.arff_to_weka_instances(
            arff=train_arffstr,
            class_index=data.domain.index(data.domain.classVar))['instances']

        test_inds = [i for i in range(len(selection)) if selection[i] == test_fold]
        test_arffstr = toARFFstring(data.select(selection, test_fold)).getvalue()
        test_data = wsutil.client.arff_to_weka_instances(
            arff=test_arffstr,
            class_index=data.domain.index(data.domain.classVar))['instances']

        # print "\t\t", "Learned on", len(train_data), "examples"
        # file.flush()
        print "before cl build"
        classifier = wseval.client.build_classifier(
            learner=somelearner, instances=train_data)['classifier']
        print "after cl build"
        eval_test_data = wseval.client.apply_classifier(classifier=classifier,
                                                        instances=test_data)
        print "after eval"
        for i in range(len(eval_test_data)):
            # print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
            print i, "in for loop", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
            if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(test_inds[i])
                count_noisy[test_fold] += 1
        # END test_data
        widget.progress = int((test_fold + 1) * 1.0 / k * 100)
        widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': getWekaName(name)}
def cf_run(learner, data, k_folds, name, widget=None):
    """Runs a classification filter.

    :param learner: WekaClassifier
    :param data: Orange dataset
    :param k_folds: number of cross-validation folds
    :param name: learner name, passed through get_weka_name
    :param widget: optional widget used for progress reporting
    :return: dict with the sorted indices of misclassified (noisy) examples
    """
    somelearner = learner
    print somelearner
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k_folds)
    count_noisy = [0] * k_folds
    for test_fold in range(k_folds):
        # train_data = wsutil.client.arff_to_weka_instances(arff=train_arffstr, class_index=data.domain.index(data.domain.classVar))['instances']
        train_data = convert_dataset_from_orange_to_scikit(
            data.select(selection, test_fold, negate=1))
        test_inds = [i for i in range(len(selection)) if selection[i] == test_fold]
        # test_data = wsutil.client.arff_to_weka_instances(arff=test_arffstr, class_index=data.domain.index(data.domain.classVar))['instances']
        test_data = convert_dataset_from_orange_to_scikit(
            data.select(selection, test_fold))
        # print "\t\t", "Learned on", len(train_data), "examples"
        # file.flush()
        print "before cl build"
        # classifier = wseval.client.build_classifier(learner=somelearner, instances=train_data)['classifier']
        learner.build_classifier(train_data)
        print "after cl build"
        # eval_test_data = wseval.client.apply_classifier(classifier=classifier, instances=test_data)
        scikit_dataset_predicted = learner.apply_classifier(test_data)
        print "after apply"
        for i in range(len(scikit_dataset_predicted.target)):
            # print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
            # print i, "in for loop", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
            # if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):
            if scikit_dataset_predicted.target[i] != scikit_dataset_predicted.targetPredicted[i]:
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(test_inds[i])
                count_noisy[test_fold] += 1
        # END test_data
        if widget is not None:
            widget.progress = int((test_fold + 1) * 1.0 / k_folds * 100)
            widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': get_weka_name(name)}
def test_MakeRandomIndicesCV(self):
    d = orange.ExampleTable("iris")

    inds = orange.MakeRandomIndicesCV(100)
    for j in range(10):
        self.assertEqual(len([i for i in inds if i == j]), 10)

    inds = orange.MakeRandomIndicesCV(103)
    for j in range(3):
        self.assertEqual(len([i for i in inds if i == j]), 11)

    inds = orange.MakeRandomIndicesCV(100, folds=100)
    self.assertEqual(len([i for i in inds if not i]), 1)

    # Check that five irises of each type get into each fold
    mr = orange.MakeRandomIndicesCV()
    inds = mr(d)
    for j in range(10):
        self.assertEqual(len([i for i in inds if i == j]), 15)
        sel = [d[i].getclass() for i, fold in enumerate(inds) if fold == j]
        for k in range(2):
            self.assertEqual(len([i for i in sel if i == k]), 5)
def crossValidation(learners, examples, folds=10,
                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                    pps=[], indicesrandseed="*", **argkw):
    """Cross-validation evaluation of learners."""
    (examples, weight) = demangleExamples(examples)
    if indicesrandseed != "*":
        indices = orange.MakeRandomIndicesCV(examples, folds,
                                             randseed=indicesrandseed,
                                             stratified=strat)
    else:
        randomGenerator = argkw.get("randseed", 0) or argkw.get("randomGenerator", 0)
        indices = orange.MakeRandomIndicesCV(examples, folds, stratified=strat,
                                             randomGenerator=randomGenerator)
    return testWithIndices(learners, (examples, weight), indices,
                           indicesrandseed, pps, **argkw)
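# Sketch of a typical call to crossValidation above, assuming the usual
# orngTest/orngStat pairing from Orange 2.x; the dataset and learner are
# illustrative.
import orange, orngStat
data = orange.ExampleTable("iris")
res = crossValidation([orange.BayesLearner(name="bayes")], data, folds=10)
print "CA: %.3f" % orngStat.CA(res)[0]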
def cross_validation(data, learners, k=10):
    ar = [0.0] * len(learners)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        result = aroc(test_data, classifiers)
        for j in range(len(learners)):
            ar[j] += result[j]
    for j in range(len(learners)):
        ar[j] = ar[j] / k
    return ar
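# Hedged driver for the AROC helper above; assumes aroc() is the scoring
# function defined alongside it, and uses a two-class dataset ("voting")
# since AROC is a binary-classification measure here.
import orange
data = orange.ExampleTable("voting")
print cross_validation(data, [orange.BayesLearner(), orange.kNNLearner(k=5)], k=10)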
def cross_validation(data, learners, k=10):
    acc = [0.0] * len(learners)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        acc1 = accuracy(test_data, classifiers)
        print "%d: %s" % (test_fold + 1, ["%.6f" % a for a in acc1])
        for j in range(len(learners)):
            acc[j] += acc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / k
    return acc
def CVByNodes(data, dimensions=None, method=None, **dic):
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)

    amb = acc = 0.0  # running totals across folds
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train, dimensions, method, originalAsMeta=True, **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)
        mb, cc = computeAmbiguityAccuracy(tree, test, -1)
        amb += mb
        acc += cc
    return amb / 10, acc / 10
def cf_run_harf(learner, data_orange, k_folds, widget=None):
    """Classification filter for the HARF learner.

    :param learner: HARF learner, possibly wrapped as an UnpicklableObject
    :param data_orange: Orange dataset
    :param k_folds: number of cross-validation folds
    :param widget: optional widget used for progress reporting
    :return: dict with the sorted indices of noisy examples and the learner name
    """
    somelearner = learner
    print "Before generate"
    learner = somelearner if not isinstance(somelearner, UnpicklableObject) \
        else somelearner.generate()
    print "After generate"
    # data_orange = input_dict['data_orange']
    print len(data_orange)
    add_meta_id(data_orange)
    print 'Before for loop'
    k = k_folds
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data_orange, folds=k)
    count_noisy = [0] * k
    print 'Before for loop'
    for test_fold in range(k):
        train_data = data_orange.select(selection, test_fold, negate=1)
        test_data = data_orange.select(selection, test_fold)
        # print "\t\t", "Learned on", len(train_data), "examples"
        # file.flush()
        print 'Before classifier construction'
        # print learner.hovername if learner.hovername != None else "no hovername"
        classifier = learner(train_data)
        print 'After classifier construction'
        for example in test_data:
            exclassified = classifier(example)
            if exclassified is not None and exclassified != example.getclass():
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(int(example["meta_id"].value))
                count_noisy[test_fold] += 1
        # END test_data
        if widget is not None:
            widget.progress = int((test_fold + 1) * 1.0 / k * 100)
            widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': learner.name}
def __call__(self, table, weight=None, verbose=0):
    import orngTest, orngStat, orngMisc

    verbose = verbose or getattr(self, "verbose", 0)
    evaluate = getattr(self, "evaluate", orngStat.CA)
    folds = getattr(self, "folds", 5)
    compare = getattr(self, "compare", lambda x, y: (x > y) - (x < y))
    returnWhat = getattr(self, "returnWhat", Tune1Parameter.returnClassifier)

    if (type(self.parameter) == list) or (type(self.parameter) == tuple):
        to_set = [self.findobj(ld) for ld in self.parameter]
    else:
        to_set = [self.findobj(self.parameter)]

    cvind = orange.MakeRandomIndicesCV(table, folds)
    findBest = orngMisc.BestOnTheFly(seed=table.checksum(), callCompareOn1st=True)
    tableAndWeight = weight and (table, weight) or table
    for par in self.values:
        for i in to_set:
            setattr(i[0], i[1], par)
        res = evaluate(orngTest.testWithIndices([self.object], tableAndWeight, cvind))
        findBest.candidate((res, par))
        if verbose == 2:
            print('*** orngWrap %s: %s:' % (par, res))

    bestpar = findBest.winner()[1]
    for i in to_set:
        setattr(i[0], i[1], bestpar)

    if verbose:
        print("*** Optimal parameter: %s = %s" % (self.parameter, bestpar))

    if returnWhat == Tune1Parameter.returnNone:
        return None
    elif returnWhat == Tune1Parameter.returnParameters:
        return bestpar
    elif returnWhat == Tune1Parameter.returnLearner:
        return self.object
    else:
        classifier = self.object(table)
        classifier.setattr("fittedParameter", bestpar)
        return classifier
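# Hedged usage sketch for the tuner __call__ above, modeled on orngWrap's
# Tune1Parameter; assumes orngTree is available and that "minSubset" is a
# tunable attribute of the wrapped TreeLearner.
import orange, orngTree
tuner = Tune1Parameter(object=orngTree.TreeLearner(),
                       parameter="minSubset",
                       values=range(2, 10))
classifier = tuner(orange.ExampleTable("voting"))
print classifier.fittedParameter  # set by the returnClassifier branch above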
def crossValidateWithSeparateTrainingAndTesting(training, testing, learner, folds=10):
    assert len(training) == len(testing)
    indices = orange.MakeRandomIndicesCV(training, folds=folds)
    cm = orngStat.ConfusionMatrix()
    for i in range(folds):
        trainingFold = training.select(indices, i, negate=1)
        testingFold = testing.select(indices, i)
        results = orngTest.learnAndTestOnTestData([learner], trainingFold, testingFold)
        fCm = orngStat.confusionMatrices(results, classIndex=0)[0]
        cm.TP += fCm.TP
        cm.FP += fCm.FP
        cm.FN += fCm.FN
        cm.TN += fCm.TN
    return cm
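# Illustrative call for the paired-table CV above (an assumption-laden
# sketch): passing the same two-class table twice satisfies the length
# assertion and yields an aggregate confusion matrix for class index 0.
import orange
data = orange.ExampleTable("voting")
cm = crossValidateWithSeparateTrainingAndTesting(data, data, orange.BayesLearner())
print cm.TP, cm.FP, cm.FN, cm.TN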
def testClassAccuracies(data, learner, k=5):
    classes = data.domain.classVar.values
    classAccuracies = [0.0] * len(classes)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for testFold in range(k):
        trainData = data.select(selection, testFold, negate=1)
        testData = data.select(selection, testFold)
        hits = [0.] * len(classes)
        totals = [0.] * len(classes)
        classifier = learner(trainData)
        for ex in testData:
            totals[int(ex.getclass())] += 1
            if classifier(ex) == ex.getclass():
                hits[int(ex.getclass())] += 1
        for i in range(len(classes)):
            classAccuracies[i] += hits[i] / totals[i]
    for i in range(len(classes)):
        print "%s: %.4f" % (classes[i], classAccuracies[i] / k)
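# Minimal sketch of calling testClassAccuracies above; iris prints one
# averaged per-class accuracy line for each of its three class values.
import orange
testClassAccuracies(orange.ExampleTable("iris"), orange.BayesLearner(), k=5)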
def cv_split(context, folds=10, random_seed=None):
    '''
    Returns a list of pairs (train_context, test_context), one for each
    cross-validation fold. The split is stratified.

    :param context: DBContext to be split
    :param folds: number of folds
    :param random_seed: random seed to be used
    :return: returns a list of (train_context, test_context) pairs
    :rtype: list

    :Example:

    >>> for train_context, test_context in cv_split(context, folds=10, random_seed=0):
    >>>     pass  # Your CV loop
    '''
    import orange

    random_seed = random.randint(0, 10**6) if not random_seed else random_seed
    input_list = context.orng_tables.get(context.target_table, None)
    indices = orange.MakeRandomIndicesCV(
        input_list,
        randseed=random_seed,
        folds=folds,
        stratified=orange.MakeRandomIndices.Stratified)
    fold_contexts = []
    for i in range(folds):
        train = input_list.select(indices, i, negate=1)
        test = input_list.select(indices, i)
        train.name = input_list.name
        test.name = input_list.name
        train_context = context.copy()
        train_context.orng_tables[context.target_table] = train
        test_context = context.copy()
        test_context.orng_tables[context.target_table] = test
        fold_contexts.append((train_context, test_context))
    return fold_contexts
def cforange(input_dict, widget):
    from workflows.helpers import UnpicklableObject
    somelearner = input_dict['learner']
    print "Before generate"
    learner = somelearner if not isinstance(somelearner, UnpicklableObject) \
        else somelearner.generate()
    print "After generate"
    data = input_dict['data']
    print len(data)
    addMetaID(data)
    print 'Before for loop'
    k = int(input_dict['k_folds'])
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    count_noisy = [0] * k
    print 'Before for loop'
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        # print "\t\t", "Learned on", len(train_data), "examples"
        # file.flush()
        print 'Before classifier construction'
        # print learner.hovername if learner.hovername != None else "no hovername"
        classifier = learner(train_data)
        print 'After classifier construction'
        for example in test_data:
            exclassified = classifier(example)
            if exclassified is not None and exclassified != example.getclass():
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(int(example["meta_id"].value))
                count_noisy[test_fold] += 1
        # END test_data
        widget.progress = int((test_fold + 1) * 1.0 / k * 100)
        widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': learner.name}
# Description: Constructs indices for cross-validation
# Category:    sampling
# Classes:     MakeRandomIndices, MakeRandomIndicesCV
# Uses:        lenses
# Referenced:  RandomIndices.htm

import orange

data = orange.ExampleTable("lenses")

print orange.MakeRandomIndicesCV(data)
print orange.MakeRandomIndicesCV(10, folds=5)
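# A short continuation sketch (not part of the original snippet): the list
# returned by MakeRandomIndicesCV assigns a fold to each example, so one
# train/test split is carved out with select().
indices = orange.MakeRandomIndicesCV(data, folds=5)
train = data.select(indices, 0, negate=1)
test = data.select(indices, 0)
print len(train), len(test)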
def learningCurve(learners, examples, cv=None, pick=None,
                  proportions=orange.frange(0.1), pps=[], **argkw):
    verb = argkw.get("verbose", 0)
    cache = argkw.get("cache", 0)
    callback = argkw.get("callback", 0)

    for pp in pps:
        if pp[0] != "L":
            raise SystemError("cannot preprocess testing examples")

    if not cv or not pick:
        seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
        if seed:
            randomGenerator = orange.RandomGenerator(seed)
        else:
            randomGenerator = argkw.get("randomGenerator", orange.RandomGenerator())
        if not cv:
            cv = orange.MakeRandomIndicesCV(
                folds=10,
                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=randomGenerator)
        if not pick:
            pick = orange.MakeRandomIndices2(
                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=randomGenerator)

    examples, weight = demangleExamples(examples)
    folds = cv(examples)
    ccsum = hex(examples.checksum())[2:]
    ppsp = encodePP(pps)
    nLrn = len(learners)

    allResults = []
    for p in proportions:
        printVerbose("Proportion: %5.3f" % p, verb)

        if (cv.randseed < 0) or (pick.randseed < 0):
            cache = 0
        else:
            fnstr = "{learningCurve}_%s_%s_%s_%s%s-%s" % (
                "%s", p, cv.randseed, pick.randseed, ppsp, ccsum)
            if "*" in fnstr:
                cache = 0

        conv = examples.domain.classVar.varType == orange.VarTypes.Discrete and int or float
        testResults = ExperimentResults(
            cv.folds, [l.name for l in learners],
            examples.domain.classVar.values.native(),
            weight != 0, examples.domain.classVar.baseValue)
        testResults.results = [
            TestedExample(folds[i], conv(examples[i].getclass()), nLrn,
                          examples[i].getweight(weight))
            for i in range(len(examples))
        ]

        if cache and testResults.loadFromFiles(learners, fnstr):
            printVerbose(" loaded from cache", verb)
        else:
            for fold in range(cv.folds):
                printVerbose(" fold %d" % fold, verb)

                # learning
                learnset = examples.selectref(folds, fold, negate=1)
                learnset = learnset.selectref(pick(learnset, p0=p), 0)
                if not len(learnset):
                    continue

                for pp in pps:
                    learnset = pp[1](learnset)

                classifiers = [None] * nLrn
                for i in range(nLrn):
                    if not cache or not testResults.loaded[i]:
                        classifiers[i] = learners[i](learnset, weight)

                # testing
                for i in range(len(examples)):
                    if folds[i] == fold:
                        # This is to prevent cheating:
                        ex = orange.Example(examples[i])
                        ex.setclass("?")
                        for cl in range(nLrn):
                            if not cache or not testResults.loaded[cl]:
                                cls, pro = classifiers[cl](ex, orange.GetBoth)
                                testResults.results[i].setResult(cl, cls, pro)
                if callback:
                    callback()
            if cache:
                testResults.saveToFiles(learners, fnstr)

        allResults.append(testResults)

    return allResults
def __call__(self, examples, weightID=0, **kwds):
    import orngTest, orngStat, statc

    self.__dict__.update(kwds)

    if self.removeThreshold < self.addThreshold:
        raise ValueError("'removeThreshold' should be larger or equal to 'addThreshold'")

    classVar = examples.domain.classVar
    indices = orange.MakeRandomIndicesCV(examples, folds=getattr(self, "folds", 10))
    domain = orange.Domain([], classVar)

    res = orngTest.testWithIndices([self.learner],
                                   orange.ExampleTable(domain, examples), indices)

    oldStat = self.stat(res)[0]
    oldStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]
    print ".", oldStat, domain
    stop = False
    while not stop:
        stop = True
        if len(domain.attributes) >= 2:
            bestStat = None
            for attr in domain.attributes:
                newdomain = orange.Domain(
                    filter(lambda x: x != attr, domain.attributes), classVar)
                res = orngTest.testWithIndices(
                    [self.learner],
                    (orange.ExampleTable(newdomain, examples), weightID),
                    indices)

                newStat = self.stat(res)[0]
                newStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]
                print "-", newStat, newdomain
                ## If stat has increased (ie newStat is better than bestStat)
                if not bestStat or cmp(newStat, bestStat) == self.statsign:
                    if cmp(newStat, oldStat) == self.statsign:
                        bestStat, bestStats, bestAttr = newStat, newStats, attr
                    elif statc.wilcoxont(oldStats, newStats)[1] > self.removeThreshold:
                        bestStat, bestStats, bestAttr = newStat, newStats, attr
            if bestStat:
                domain = orange.Domain(
                    filter(lambda x: x != bestAttr, domain.attributes), classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print "removed", bestAttr.name

        bestStat, bestAttr = oldStat, None
        for attr in examples.domain.attributes:
            if not attr in domain.attributes:
                newdomain = orange.Domain(domain.attributes + [attr], classVar)
                res = orngTest.testWithIndices(
                    [self.learner],
                    (orange.ExampleTable(newdomain, examples), weightID),
                    indices)

                newStat = self.stat(res)[0]
                newStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]
                print "+", newStat, newdomain
                ## If stat has increased (ie newStat is better than bestStat)
                if cmp(newStat, bestStat) == self.statsign and \
                   statc.wilcoxont(oldStats, newStats)[1] < self.addThreshold:
                    bestStat, bestStats, bestAttr = newStat, newStats, attr
        if bestAttr:
            domain = orange.Domain(domain.attributes + [bestAttr], classVar)
            oldStat, oldStats = bestStat, bestStats
            stop = False
            print "added", bestAttr.name

    return self.learner(orange.ExampleTable(domain, examples), weightID)
def __call__(self, table, weight=None, verbose=0):
    import orngTest, orngStat, orngMisc

    evaluate = getattr(self, "evaluate", orngStat.CA)
    folds = getattr(self, "folds", 5)
    compare = getattr(self, "compare", cmp)
    verbose = verbose or getattr(self, "verbose", 0)
    returnWhat = getattr(self, "returnWhat", Tune1Parameter.returnClassifier)
    progressCallback = getattr(self, "progressCallback", lambda i: None)

    to_set = []
    parnames = []
    for par in self.parameters:
        if (type(par[0]) == list) or (type(par[0]) == tuple):
            to_set.append([self.findobj(ld) for ld in par[0]])
            parnames.append(par[0])
        else:
            to_set.append([self.findobj(par[0])])
            parnames.append([par[0]])

    cvind = orange.MakeRandomIndicesCV(table, folds)
    findBest = orngMisc.BestOnTheFly(seed=table.checksum(), callCompareOn1st=True)
    tableAndWeight = weight and (table, weight) or table
    numOfTests = sum([len(x[1]) for x in self.parameters])
    milestones = set(range(0, numOfTests, max(numOfTests / 100, 1)))
    for itercount, valueindices in enumerate(
            orngMisc.LimitedCounter([len(x[1]) for x in self.parameters])):
        values = [self.parameters[i][1][x] for i, x in enumerate(valueindices)]
        for pi, value in enumerate(values):
            for i, par in enumerate(to_set[pi]):
                setattr(par[0], par[1], value)
                if verbose == 2:
                    print("%s: %s" % (parnames[pi][i], value))

        res = evaluate(orngTest.testWithIndices([self.object], tableAndWeight, cvind))
        if itercount in milestones:
            progressCallback(100.0 * itercount / numOfTests)

        findBest.candidate((res, values))
        if verbose == 2:
            print("===> Result: %s\n" % res)

    bestpar = findBest.winner()[1]
    if verbose:
        print("*** Optimal set of parameters: ", end=' ')
    for pi, value in enumerate(bestpar):
        for i, par in enumerate(to_set[pi]):
            setattr(par[0], par[1], value)
            if verbose:
                print("%s: %s" % (parnames[pi][i], value), end=' ')
    if verbose:
        print()

    if returnWhat == Tune1Parameter.returnNone:
        return None
    elif returnWhat == Tune1Parameter.returnParameters:
        return bestpar
    elif returnWhat == Tune1Parameter.returnLearner:
        return self.object
    else:
        classifier = self.object(table)
        classifier.fittedParameters = bestpar
        return classifier
#whole_table = proj_utils.load_data(sys.argv[1])
#start_domain = Orange.data.Domain(whole_table.domain.attributes[4:])
#start_data = Orange.data.Table(start_domain, whole_table)
start_data = proj_utils.load_data(sys.argv[1])
cv_folds = int(sys.argv[2])
features = int(sys.argv[3])

# default scoring algorithm
scores = Orange.feature.scoring.score_all(start_data)
data = Orange.feature.selection.select(start_data, scores, features)
train_data, test_data = proj_utils.partition_data(data)
selection = orange.MakeRandomIndicesCV(data, cv_folds)

sen1 = 0.0
spe1 = 0.0
acc1 = 0.0
sen2 = 0.0
spe2 = 0.0
acc2 = 0.0

results = orngTest.crossValidation(learners, data, folds=5)

"""
based on http://orange.biolab.si/doc/ofb/c_performance.htm
"""
for test_fold in range(cv_folds):
    train_data = data.select(selection, test_fold, negate=1)
    test_data = data.select(selection, test_fold)
def __call__(self, examples, weight=0):
    if not (examples.domain.classVar.varType == 1 and
            len(examples.domain.classVar.values) == 2):
        # failing the assumptions of margin-metalearner...
        return MarginMetaClassifierWrap(self.learner(examples))

    mv = orange.FloatVariable(name="margin")
    estdomain = orange.Domain([mv, examples.domain.classVar])
    mistakes = orange.ExampleTable(estdomain)
    if weight != 0:
        mistakes.addMetaAttribute(1)

    for replication in range(self.replications):
        # perform 10 fold CV, and create a new dataset
        try:
            selection = orange.MakeRandomIndicesCV(
                examples, self.folds, stratified=0,
                randomGenerator=orange.globalRandom)  # orange 2.2
        except:
            selection = orange.RandomIndicesCVGen(examples, self.folds)  # orange 2.1
        for fold in range(self.folds):
            if self.folds != 1:  # no folds
                learn_data = examples.selectref(selection, fold, negate=1)
                test_data = examples.selectref(selection, fold)
            else:
                learn_data = examples
                test_data = examples

            # fulldata removes the influence of scaling on the distance dispersion.
            if weight != 0:
                if self.fulldata:
                    classifier = self.learner(learn_data, weight=weight, fulldata=examples)
                else:
                    classifier = self.learner(learn_data, weight=weight)
            else:
                if self.fulldata:
                    classifier = self.learner(learn_data, fulldata=examples)
                else:
                    classifier = self.learner(learn_data)

            # normalize the range
            if self.normalization:
                mi = 1e100
                ma = -1e100
                for ex in learn_data:
                    margin = classifier.getmargin(ex)
                    mi = min(mi, margin)
                    ma = max(ma, margin)
                coeff = 1.0 / max(ma - mi, 1e-16)
            else:
                coeff = 1.0

            for ex in test_data:
                margin = coeff * classifier.getmargin(ex)
                if type(margin) == type(1.0) or type(margin) == type(1):
                    # ignore those examples which are handled with
                    # the actual probability distribution
                    mistake = orange.Example(estdomain, [float(margin), ex.getclass()])
                    if weight != 0:
                        mistake.setmeta(ex.getMetaAttribute(weight), 1)
                    mistakes.append(mistake)

    if len(mistakes) < 1:
        # nothing to learn from
        if weight == 0:
            return self.learner(examples)
        else:
            return self.learner(examples, weight)

    if weight != 0:
        # learn a classifier to estimate the probabilities from margins
        # learn a classifier for the whole training set
        estimate = self.metalearner(mistakes, weight=1)
        classifier = self.learner(examples, weight)
    else:
        estimate = self.metalearner(mistakes)
        classifier = self.learner(examples)

    # normalize the range
    if self.normalization:
        mi = 1e100
        ma = -1e100
        for ex in examples:
            margin = classifier.getmargin(ex)
            mi = min(mi, margin)
            ma = max(ma, margin)
        coeff = 1.0 / max(ma - mi, 1e-16)
    else:
        coeff = 1.0

    # print estimate.classifier.classifier
    # for x in mistakes:
    #     print x, estimate(x, orange.GetBoth)
    return MarginMetaClassifier(classifier, estimate, examples.domain, estdomain, coeff)
# Description: Adds two new numerical attributes to the iris data set and tests through cross-validation whether this helps boost classification accuracy
# Category:    modelling
# Uses:        iris
# Classes:     Domain, FloatVariable, MakeRandomIndicesCV, orngTest.testWithIndices
# Referenced:  domain.htm

import orange, orngTest, orngStat, orngTree

data = orange.ExampleTable('iris')

sa = orange.FloatVariable("sepal area")
sa.getValueFrom = lambda e, getWhat: e['sepal length'] * e['sepal width']

pa = orange.FloatVariable("petal area")
pa.getValueFrom = lambda e, getWhat: e['petal length'] * e['petal width']

newdomain = orange.Domain(data.domain.attributes + [sa, pa, data.domain.classVar])
newdata = data.select(newdomain)

learners = [orngTree.TreeLearner(mForPruning=2.0)]
indices = orange.MakeRandomIndicesCV(data, 10)
res1 = orngTest.testWithIndices(learners, data, indices)
res2 = orngTest.testWithIndices(learners, newdata, indices)

print "original: %5.3f, new: %5.3f" % (orngStat.CA(res1)[0], orngStat.CA(res2)[0])
def main():
    from sys import argv
    map_fn = argv[1]
    gtruth_tag_fn = argv[2]
    cluster_fn = argv[3]
    assignment_fns = argv[4:]

    tagFile = tag_util.tag_file(gtruth_tag_fn, map_fn)
    tagFile.get_map()
    tagFile.get_tag_names()
    skeleton = carmen_map_skeletonizer.load(cluster_fn, map_fn)
    assignments = [Assignment.load(assignment_fn, tagFile, skeleton)
                   for assignment_fn in assignment_fns]

    engineMap = dict((x.name, x) for x in [bring.Engine(),
                                           follow.Engine(),
                                           meet.Engine(),
                                           avoid.Engine(),
                                           #wander.Engine(),
                                           #go.Engine(),
                                           ])

    for engine in engineMap.values():
        verb = engine.name
        if verb != "follow" and False:
            continue

        def run():
            return makeTable(engine, assignments)
        #cProfile.runctx("run()", globals(), locals(), "profile.out")
        #return
        table = run()
        print "verb", verb, len(table)

        cv_indices = orange.MakeRandomIndicesCV(table, 2)
        humanLabeledTraining = table.select(cv_indices, 0)

        training = orange.ExampleTable(humanLabeledTraining.domain)
        training.extend(humanLabeledTraining)
        generatedTraining = makeSubsetExamples(engine, humanLabeledTraining)
        training.extend(generatedTraining)
        print "Using", len(generatedTraining), "subset examples"

        testing = table.select(cv_indices, 1)

        #testFeatureSubsets(engine, training, testing)

        #classifier = orngBayes.BayesLearner(training)
        classifier = RandomForestLearner(training)
        results = orngTest.testOnData([classifier], testing)
        print "results", results

        tuples = list(zip(testing, results.results))
        tuples.sort(key=lambda x: x[0]["description"])
        for e, r in tuples:
            # print e["description"], e["hasApproach"], e["hasFollow"],
            if r.actualClass == r.classes[0]:
                print "correct", e["description"], e["entry"].value.id
            else:
                print "incorrect", e["description"], e["entry"].value.id

        mpl.figure(figsize=(6, 6))
        mpl.subplots_adjust(bottom=0.13)
        line, = orangeGui.rocCurve(results, engine.name, stepSize=0.001,
                                   plotArgs={"color": "black"})
        orangeUtils.displayResults(results)
        mpl.xlabel("FP", fontsize=32)
        mpl.ylabel("TP", fontsize=32)
        mpl.xticks((0, 1), fontsize=20)
        mpl.yticks((0, 1), fontsize=20)
        line.set_label(engine.name)
        mpl.title(engine.name.capitalize(), fontsize=32)
        mpl.savefig("roc_%s.png" % engine.name)
        mpl.savefig("roc_%s.ps" % engine.name)
        mpl.show()
def run(self):
    self.cleanup()
    if self.is_for_loop():
        fi = None
        fo = None
        for w in self.widgets:
            if w.type == 'for_input':
                fi = w
            if w.type == 'for_output':
                fo = w
        outer_output = self.parent.outputs[fo.inputs.all()[0].outer_output_id]
        outer_output.value = []
        input_list = self.parent.inputs[fi.outputs.all()[0].outer_input_id].value
        for i in input_list:
            self.cleanup()
            proper_output = fi.outputs.all()[0]
            proper_output.value = i
            fi.finished = True
            self.run_all_unfinished_widgets()
    elif self.is_cross_validation():
        import random as rand
        fi = None
        fo = None
        for w in self.widgets:
            if w.type == 'cv_input':
                fi = w
            if w.type == 'cv_output':
                fo = w
        outer_output = self.parent.outputs[fo.inputs.all()[0].outer_output_id]
        outer_output.value = []
        input_list = self.parent.inputs[fi.outputs.all()[0].outer_input_id].value
        input_fold = self.parent.inputs[fi.outputs.all()[1].outer_input_id].value
        input_seed = self.parent.inputs[fi.outputs.all()[2].outer_input_id].value

        if input_fold is not None:
            input_fold = int(input_fold)
        else:
            input_fold = 10

        if input_seed is not None:
            input_seed = int(input_seed)
        else:
            input_seed = random.randint(0, 10**9)

        input_type = input_list.__class__.__name__
        context = None
        if input_type == 'DBContext':
            context = input_list
            input_list = context.orng_tables.get(context.target_table, None)
        elif input_type == 'DocumentCorpus':
            document_corpus = input_list
            input_list = document_corpus.documents

        if not input_list:
            raise Exception('CrossValidation: Empty input list!')

        folds = []
        if hasattr(input_list, "get_items_ref"):
            import orange
            indices = orange.MakeRandomIndicesCV(
                input_list,
                randseed=input_seed,
                folds=input_fold,
                stratified=orange.MakeRandomIndices.Stratified)
            for i in range(input_fold):
                output_train = input_list.select(indices, i, negate=1)
                output_test = input_list.select(indices, i)
                output_train.name = input_list.name
                output_test.name = input_list.name
                folds.append((output_train, output_test))
        elif input_type == 'DocumentCorpus':
            from sklearn.cross_validation import StratifiedKFold, KFold
            if 'Labels' in document_corpus.features:
                labels = document_corpus.get_document_labels()
                #print "Seed:" + str(input_seed)
                stf = StratifiedKFold(labels, n_folds=input_fold,
                                      random_state=input_seed)
            else:
                stf = KFold(len(document_corpus.documents),
                            n_folds=input_fold, random_state=input_seed)
            folds = [(list(train_index), list(test_index))
                     for train_index, test_index in stf]
        else:
            rand.seed(input_seed)
            rand.shuffle(input_list)
            folds = [input_list[i::input_fold] for i in range(input_fold)]

        proper_output = fi.outputs.all()[2]
        proper_output.value = input_seed
        for i in range(len(folds)):
            #import pdb; pdb.set_trace()
            if hasattr(input_list, "get_items_ref"):
                output_test = folds[i][1]
                output_train = folds[i][0]
            elif input_type == 'DocumentCorpus':
                train_indices, test_indices = folds[i]
                print "engine"
                print "TRAIN:", train_indices, "TEST:", test_indices
                output_train, output_test = document_corpus.split(
                    train_indices, test_indices)
            else:
                output_train = folds[:i] + folds[i + 1:]
                output_test = folds[i]
            if input_type == 'DBContext':
                output_train_obj = context.copy()
                output_train_obj.orng_tables[context.target_table] = output_train
                output_test_obj = context.copy()
                output_test_obj.orng_tables[context.target_table] = output_test
                output_train = output_train_obj
                output_test = output_test_obj
            self.cleanup()
            proper_output = fi.outputs.all()[0]  # inner output
            proper_output.value = output_train
            proper_output = fi.outputs.all()[1]  # inner output
            proper_output.value = output_test
            fi.finished = True  # set the input widget as finished
            self.run_all_unfinished_widgets()
    else:
        self.run_all_unfinished_widgets()
    self.save()