def prune_sf(data, minExmplsInLeaf, progress_steps, widget=None): """Prune Saturation Filter :param data: :param minExmplsInLeaf: :param progress_steps: :param widget: :return: """ print "\t", "Pruning + Saturation Filter:" #file.flush() classifier = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=0, storeExamples=1) print "\t\t", "Classifier complexity:\t", orngTree.countNodes( classifier), "nodes." #file.flush() ## [noisyA, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf) [noisePruned, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf) print "\t\t", len(noisePruned), "example(s) were excluded by pruning." #file.flush() classifier2 = orngTree.TreeLearner(dataset, sameMajorityPruning=1, mForPruning=0, storeExamples=1) print "\t\t", "Pruned Classifier complexity:", orngTree.countNodes( classifier2), "nodes. " #file.flush() # Saturation filtering ## [noisy_data, filtered_data] = saturation(dataset, "tree") n = len(data) #widget.progress = int(len(noisePruned)*1.0/len(data)*100) if not (widget == None): widget.progress = int( sum([n - i for i in range(len(noisePruned))]) * 1.0 / progress_steps * 100) widget.save() print "progress:", widget.progress #[noiseSF, filtered_data] = saturation(dataset, widget)#, "tree") noiseSF = saturation(dataset, widget) #, "tree") #print "\t\t", "Size of filtered dataset:", len(filtered_data) print "\t\t", "Noisy examples (", len(noiseSF["inds"])+len(noisePruned),"(",len(noisePruned),"pruned,",\ len(noiseSF["inds"]), "SF ))\n"#: (class, id)" #file.flush() #noisy_data.sort(meta_id) #noiseSF.sort() # Merge both obtained sets of noisy examples #noisyA.extend(noisy_data) noisePruned.extend(noiseSF["inds"]) #return noisyA return {"inds": sorted(noisePruned), "name": "PruneSF"}
def main(phase, make): if (phase == 4): f = FeatureExtractor2.FeatureExtractor(createFile=make) ft = FeatureExtractor3.FeatureExtractor(createFile=make) idlist = f.IDs idlist2 = ft.IDs FeatureTable = orange.ExampleTable("table2") TestTable = orange.ExampleTable("table3") training, test = SplitDataInHalf(FeatureTable, f.size) learner = orngTree.TreeLearner(training) res = orngTest.testOnData([learner], test) if make == True: learner = orngTree.TreeLearner(FeatureTable) res = orngTest.testOnData([learner], TestTable) res2 = orngTest.testOnData([learner], FeatureTable) WriteToFile("dev_tonder_olsen.txt", res2, idlist) WriteToFile("test_tonder_olsen.txt", res, idlist2) printresult() else: f = featureExtractor.FeatureExtractor(createFile=True) FeatureTable = orange.ExampleTable("table") learner, res = CrossValidation(FeatureTable, f.size, 10) guessyes = 0 guessno = 0 correctyes = 0 correctno = 0 for r in res.results: if str(r.classes[0]) == "1": prtres = "Yes" else: prtres = "No" if str(r.actualClass) == "1": prttrue = "Yes" correctyes = correctyes + 1 else: prttrue = "No" correctno = correctno + 1 #print str(r.classes[0]) + " vs correct: " + str(r.actualClass) if prtres == "No" and prttrue == "No": guessno = guessno + 1 elif prtres == "Yes" and prttrue == "Yes": guessyes = guessyes + 1 print "Guessed " + prtres + " and the correct answer was: " + prttrue #res = orngTest.leaveOneOut([learner],FeatureTable) #printresult = orngStat.CA(res, orngStat.IS(res)) #print "Yes Accuracy: " + str(float(guessyes)/float(correctyes)) #print "No Accuracy: " + str(float(guessno)/float(correctno)) printresult = orngStat.CA(res) print "Accuracy: " + str(printresult[0])
def CrossValidation(FeatureTable, n, p):
    """Train one tree per fold and return the best-scoring learner.

    FeatureTable = an orange ExampleTable with training data
    n = the size of the test data
    p = the number of sections you will make of the training data

    For each of the ``p`` folds, a slice of ``n / p`` examples is held out,
    a tree is trained on the remaining examples with index below ``n`` and
    tested on the held-out slice. The fold with the most correct
    predictions wins; ties keep the earliest fold.

    Returns a ``(learner, results)`` pair for the winning fold, or
    ``(None, None)`` when ``p`` is 0.
    """
    learner = None
    results = None
    # Start below zero so that a fold is always selected, even when no fold
    # gets a single prediction right. Callers dereference the result.
    best = -1
    for i in range(p):
        start = i * n / p
        end = start + (n / p)
        testData = FeatureTable.getItems(range(start, end))
        trainingData = FeatureTable.getItems(range(0, start))
        for x in range(end, n):
            trainingData.append(FeatureTable[x])
        l = orngTree.TreeLearner(trainingData)
        res = orngTest.testOnData([l], testData)
        c = 0
        for r in res.results:
            if r.classes[0] == r.actualClass:
                c = c + 1
        if c > best:
            best = c
            learner = l
            results = res
    return learner, results
def constructLearner(self):
    """Build the random forest learner described by the current settings.

    Returns the forest learner, wrapped by ``self.preprocessor`` when one is
    set, with its ``name`` set to ``self.name``.
    """
    rand = random.Random(self.rseed)
    # The attribute count is only passed on when the option is enabled.
    attrs = self.attributesP if self.attributes else None

    baseTree = orngTree.TreeLearner()
    baseTree.stop.minExamples = self.preNodeInstP if self.preNodeInst else 0
    for flag in ("storeExamples", "storeNodeClassifier",
                 "storeContingencies", "storeDistributions"):
        setattr(baseTree, flag, 1)
    if self.limitDepth:
        baseTree.maxDepth = self.limitDepthP

    forest = orngEnsemble.RandomForestLearner(base_learner=baseTree,
                                              trees=self.trees,
                                              rand=rand,
                                              attributes=attrs)
    if self.preprocessor:
        forest = self.preprocessor.wrapLearner(forest)
    forest.name = self.name
    return forest
def __init__(self,
             learner=None,
             trees=100,
             attributes=None,
             name='Random Forest',
             rand=None,
             callback=None):
    """Store the forest settings and set up the base tree learner.

    When no ``learner`` is given, a small tree learner using the gini
    measure and a random attribute-subset split constructor is created.
    When no ``rand`` is given, a ``random.Random`` seeded with 0 is used.
    """
    self.trees = trees
    self.name = name
    self.learner = learner
    self.attributes = attributes
    self.callback = callback

    if not rand:
        rand = random.Random()
        rand.seed(0)
    self.rand = rand
    self.randstate = self.rand.getstate()  # original state

    if learner:
        return

    # tree learner assembled as suggested by Breiman (2001)
    baseTree = orngTree.TreeLearner(storeNodeClassifier=0,
                                    storeContingencies=0,
                                    storeDistributions=1,
                                    minExamples=5).instance()
    gini = orange.MeasureAttribute_gini()
    baseTree.split.discreteSplitConstructor.measure = gini
    baseTree.split.continuousSplitConstructor.measure = gini
    baseTree.split = SplitConstructor_AttributeSubset(
        baseTree.split, attributes, self.rand)
    self.learner = baseTree
def setLearner(self): if hasattr(self, "btnApply"): self.btnApply.setFocus() if not self.limitDepth: mDepth = {} else: mDepth = {'maxDepth': self.maxDepth} self.learner = orngTree.TreeLearner( measure=self.measures[self.estim][1], reliefK=self.relK, reliefM=self.limitRef and self.relM or -1, binarization=self.bin, minExamples=self.preNodeInst and self.preNodeInstP, minSubset=self.preLeafInst and self.preLeafInstP, maxMajority=self.preNodeMaj and self.preNodeMajP / 100.0 or 1.0, sameMajorityPruning=self.postMaj, mForPruning=self.postMPruning and self.postM, storeExamples=1, **mDepth) self.learner.name = self.name if self.preprocessor: self.learner = self.preprocessor.wrapLearner(self.learner) self.send("Learner", self.learner) self.error() if self.data: try: self.classifier = self.learner(self.data) self.classifier.name = self.name except Exception, (errValue): self.error(str(errValue)) self.classifier = None
def CVByPairs(data, dimensions=None, method=None, **dic):
    """Run 10-fold cross-validation of pade-derived trees, scored by pairs.

    Returns the three values reported by
    ``computeDirectionAccuracyForPairs``, each summed over the folds and
    divided by 10.
    """
    import orngTree

    FOLDS = 10
    cv = orange.MakeRandomIndicesCV(data, FOLDS)
    meter = orange.ExamplesDistanceConstructor_Euclidean(data)

    # The largest distance among 100 random example pairs sets the weight.
    sampled = [meter(data.randomexample(), data.randomexample())
               for _ in range(100)]
    maxDist = max([0] + sampled)
    weightK = 10.0 / maxDist

    totals = [0, 0, 0]  # acc, amb, unre
    for fold in range(FOLDS):
        train = data.select(cv, fold, negate=1)
        # NOTE(review): the held-out part is selected but not used below;
        # scoring runs on the full data set. Confirm this is intended.
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train,
                                 dimensions,
                                 method,
                                 originalAsMeta=True,
                                 **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)
        foldScores = computeDirectionAccuracyForPairs(
            tree, data, meter, weightK, -1)
        tacc, tamb, tunre = foldScores
        totals[0] += tacc
        totals[1] += tamb
        totals[2] += tunre

    acc, amb, unre = totals
    return acc / 10, amb / 10, unre / 10
def __init__(self):
    """Initialise the tag-tracking state, train the tree, create the publisher."""
    # Tag observation state.
    self.last_tag_seen = '0'
    self.tag_visible = 'f'
    self.tag_x_coord = 180.0
    self.tag_distance = 2000.0
    self.bumping = 'f'

    # Tree trained on the stored tag data.
    tagData = orange.ExampleTable("analyzed_data/tag_data")
    self.data = tagData
    self.tree = orngTree.TreeLearner(tagData, maxMajority=0.7)

    # Publisher for Twist messages on 'cmd_vel'.
    self.pub = rospy.Publisher('cmd_vel', Twist)

    self.picked_up = True
    self.generating_tree = False
def classify():
    """Train a naive Bayes and a tree classifier on the audio training data.

    Returns ``(classifiers, trainData, testData)`` where ``classifiers`` is
    ``[bayes, tree]`` and each classifier carries a ``name``.
    """
    import orange, orngTree

    testData = orange.ExampleTable('data/audioTest.tab')
    trainData = orange.ExampleTable('data/audioTrain.tab')

    classifiers = []
    for label, learnerClass in (("bayes", orange.BayesLearner),
                                ("tree", orngTree.TreeLearner)):
        trained = learnerClass(trainData)
        trained.name = label
        classifiers.append(trained)
    return classifiers, trainData, testData
def CVByNodes(data, dimensions=None, method=None, **dic):
    """Run 10-fold cross-validation of pade-derived trees, scored by nodes.

    For every fold a depth-4 tree is trained on the pade output of the
    training part and scored on the held-out part with
    ``computeAmbiguityAccuracy``.

    Returns ``(amb, acc)``: the two values reported by
    ``computeAmbiguityAccuracy``, each summed over the folds and divided
    by 10.
    """
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)
    # The accumulators must exist before the first "+=". They used to be
    # missing, which raised UnboundLocalError on the first fold.
    amb = acc = 0
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train,
                                 dimensions,
                                 method,
                                 originalAsMeta=True,
                                 **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)
        mb, cc = computeAmbiguityAccuracy(tree, test, -1)
        amb += mb
        acc += cc
    return amb / 10, acc / 10
def btnBuildClicked(self):
    """Grow a subtree at the current node from the examples stored there.

    Uses ``self.treeLearner`` when set, otherwise a default
    ``orngTree.TreeLearner``. Does nothing when there is no current node,
    the node holds no examples, or learning fails. If the learner returns
    something without a ``tree`` attribute, an error box is shown and the
    node is left untouched.
    """
    node = self.findCurrentNode()
    if not node or not len(node.examples):
        return

    try:
        newtree = (self.treeLearner
                   or orngTree.TreeLearner(storeExamples=1))(node.examples)
    except Exception:
        # Best effort: a failed induction leaves the tree unchanged.
        return

    if not hasattr(newtree, "tree"):
        QMessageBox.critical(
            None, "Invalid Learner",
            "The learner on the input built a classifier which is not a tree.",
            QMessageBox.Ok)
        # Without this return the loop below fails on the missing attribute.
        return

    # Copy the root of the new tree over the current node.
    for k, v in newtree.tree.__dict__.items():
        node.setattr(k, v)
    self.updateTree()
def makeLearner(self):
    """Return a depth-limited tree learner wrapped in RejectInsaneExampleLearner."""
    # Tree that keeps its examples, uses the common stop criteria and
    # grows at most five levels deep.
    baseTree = orngTree.TreeLearner(storeExamples=True)
    baseTree.stop = orange.TreeStopCriteria_common()
    baseTree.maxDepth = 5

    wrapped = preposition.RejectInsaneExampleLearner(baseTree)
    return wrapped
def report_tree(self, name):
    """Dump the collected rows to ``name + '.tsv'`` and build a tree report.

    The TSV holds three header lines (``self.names``, ``self.types`` and the
    literal ``class``) followed by every row of ``self.rows`` whose first
    cell is not ``'skip'``. Short rows are padded in place with empty
    strings up to ``len(self.names)``.

    If Orange can be imported, a classification tree is learned from the
    TSV, printed, and written to ``name + '.txt'`` and ``name + '.dot'``.

    Returns the textual tree dump, or None when Orange is not installed.
    """
    filename = name + '.tsv'
    stream = open(filename, 'wt')
    try:
        # header
        stream.write('\t'.join(self.names) + '\n')
        stream.write('\t'.join(self.types) + '\n')
        stream.write('class\n')
        # rows
        for row in self.rows:
            if row[0] == 'skip':
                continue
            # In-place "+=" on purpose: the padding is kept in self.rows.
            row += [''] * (len(self.names) - len(row))
            stream.write('\t'.join(row) + '\n')
    finally:
        # Close, and so flush, the file even if a row fails to serialise.
        stream.close()

    # See http://www.ailab.si/orange/doc/ofb/c_otherclass.htm
    try:
        import orange
        import orngTree
    except ImportError:
        sys.stderr.write(
            'Install Orange from http://www.ailab.si/orange/ for a classification tree.\n'
        )
        return None

    data = orange.ExampleTable(filename)
    tree = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=2)
    orngTree.printTxt(tree, maxDepth=4)
    text_tree = orngTree.dumpTree(tree)

    out = open(name + '.txt', 'wt')
    try:
        out.write(text_tree)
    finally:
        out.close()

    orngTree.printDot(tree,
                      fileName=name + '.dot',
                      nodeShape='ellipse',
                      leafShape='box')
    return text_tree
def orange_dt_rules(self):
    """Learn a decision tree on the influence-labelled data and extract rules.

    The time spent computing the two influence cutoffs is stored in
    ``self.cost_cutoff``.

    Returns ``(training, rules)``: the training table and the clauses
    produced from the learned tree by ``tree_to_clauses``.
    """
    started = time.time()
    bad_cutoff = self.influence_cutoff(self.bad_tables)
    good_cutoff = self.influence_cutoff(self.good_tables)
    _logger.debug("cutoffs\t%f\t%f", bad_cutoff, good_cutoff)
    self.cost_cutoff = time.time() - started

    _logger.debug("creating training data")
    training = self.create_training(bad_cutoff, good_cutoff)

    classifier = orngTree.TreeLearner(training)
    rules = tree_to_clauses(training, classifier.tree)
    return training, rules
def main():
    """Cross-validate a tree learner on the papers table and report results.

    Prints CA / Brier / AUC for each learner, builds one confusion table per
    fold, prints the aggregated confusion table and its measures, and
    finally prints one of the stored fold classifiers as text.
    """
    paper_table = build_papers_table()
    tree = orngTree.TreeLearner(minSubset=5, sameMajorityPruning=True)
    learners = [tree]
    FOLDS = 10
    # storeClassifiers=1 is needed: results.classifiers is read below.
    results = Orange.evaluation.testing.cross_validation(learners,
                                                         paper_table,
                                                         folds=FOLDS,
                                                         storeClassifiers=1)
    confusions = []
    print "Learner CA Brier AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f %5.3f %5.3f" % (learners[i].name, \
            Orange.evaluation.scoring.CA(results)[i],
            Orange.evaluation.scoring.Brier_score(results)[i],
            Orange.evaluation.scoring.AUC(results)[i])
        for k in range(0, FOLDS):
            # Examples whose result was produced in fold k.
            indices = [
                paper_table[x] for x in range(0, len(paper_table))
                if results.results[x].iteration_number == k
            ]
            confusions.append(
                buildConfusion(indices, results.classifiers[k][i],
                               TYPE_DIRS.keys()))
        confusion = buildTotalConfusions(confusions, TYPE_DIRS.keys())
        printConfusion(confusion, TYPE_DIRS.keys())
        printMeasures(confusion)
        # k still holds the index of the last fold here, so only that
        # fold's classifier is printed.
        # NOTE(review): the nesting of this tail was reconstructed; with a
        # single learner, inside or after the learner loop is equivalent.
        orngTree.printTxt(results.classifiers[k][i],
                          leafStr="%V (%M / %N)",
                          nodeStr="(%M / %N)",
                          leafFields=['major', 'contingency'])
def train(self, trainset):
    """
    Trains an ensemble of tree with Adaboost.M1.

    Stores the number of targets in ``self.n_classes``, the converted data
    set's domain in ``self.trainset_domain`` and the boosted classifier in
    ``self.boosted_trees``.
    """
    self.n_classes = len(trainset.metadata['targets'])

    orange_data = make_orange_dataset(trainset)
    self.trainset_domain = orange_data.domain

    base_tree = orngTree.TreeLearner(
        max_majority=self.max_majority,
        max_depth=self.max_depth,
        min_instances=self.min_leaf_size,
        skip_prob=self.skip_prob,
    )
    booster = orngEnsemble.BoostedLearner(
        learner=base_tree,
        t=self.n_trees,
        name="AdaBoost.M1",
    )
    self.boosted_trees = booster(instances=orange_data)
def summary(self):
    """Print the test totals, dump the rows to a TSV and build a tree report.

    Output files are named after the running script: the basename of
    ``sys.argv[0]`` without extension, plus ``.tsv``, ``.txt`` and ``.dot``.
    The TSV holds three header lines (``self.names``, ``self.types`` and the
    literal ``class``) followed by ``self.rows``. Short rows are padded in
    place with empty strings up to ``len(self.names)``.

    The tree report is skipped, with a hint on stderr, when Orange cannot
    be imported.
    """
    sys.stdout.write("%u tests, %u passed, %u skipped, %u failed\n\n" %
                     (self.tests, self.passed, self.skipped, self.failed))
    sys.stdout.flush()

    name, ext = os.path.splitext(os.path.basename(sys.argv[0]))
    filename = name + '.tsv'
    stream = open(filename, 'wt')
    try:
        # header
        stream.write('\t'.join(self.names) + '\n')
        stream.write('\t'.join(self.types) + '\n')
        stream.write('class\n')
        # rows
        for row in self.rows:
            # In-place "+=" on purpose: the padding is kept in self.rows.
            row += [''] * (len(self.names) - len(row))
            stream.write('\t'.join(row) + '\n')
    finally:
        # Close, and so flush, the file even if a row fails to serialise.
        stream.close()

    # See http://www.ailab.si/orange/doc/ofb/c_otherclass.htm
    try:
        import orange
        import orngTree
    except ImportError:
        sys.stderr.write(
            'Install Orange from http://www.ailab.si/orange/ for a classification tree.\n'
        )
        return

    data = orange.ExampleTable(filename)
    tree = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=2)
    orngTree.printTxt(tree, maxDepth=4)

    out = open(name + '.txt', 'wt')
    try:
        out.write(orngTree.dumpTree(tree))
    finally:
        out.close()

    orngTree.printDot(tree,
                      fileName=name + '.dot',
                      nodeShape='ellipse',
                      leafShape='box')
def test_author_classification_dummy_dataset(self):
    """A tree trained on the dummy author table predicts class 0 for a class-0-like row."""
    # Five rows of class 0 followed by five rows of class 1.
    class_zero_rows = [
        [0.2, 0.5, 0.2, 0.2, 0.1, 10., 0],
        [0.2, 0.3, 0.12, 0.1, 0.1, 10., 0],
        [0.2, 0.2, 0.08, 0.2, 0.01, 20., 0],
        [0.2, 0.5, 0.1, 0.1, 0.2, 5., 0],
        [0.2, 0.1, 0.2, 0.2, 0.3, 20., 0],
    ]
    class_one_rows = [
        [0.7, 0.5, 0.2, 0.8, 0.3, 0.1, 1],
        [0.6, 0.8, 5.2, 0.2, 0.6, 0.3, 1],
        [0.2, 0.6, 8.2, 0.9, 0.9, 0.1, 1],
        [0.5, 0.9, 1.2, 0.1, 0.1, 0.2, 1],
        [0.9, 0.1, 0.9, 0.6, 0.3, 0.6, 1],
    ]
    train_set = numpy.array(class_zero_rows + class_one_rows)
    attributes = [
        "retweets", "links", "retweeted", "replies", "mentions", "ff-ratio",
        "class"
    ]
    table = construct_orange_table(attributes, train_set, classed=True)

    treeClassifier = orngTree.TreeLearner()(table)

    probe = [0.2, 0.5, 0.2, 0.2, 0.1, 100, 0]
    example = Orange.data.Instance(table.domain, probe)
    prediction = treeClassifier(example)
    self.assertEquals(0, prediction.value)
def learnModel(self, X, y):
    """Train a class-weighted random forest on binary-labelled data.

    Raises ValueError when ``y`` does not hold exactly two distinct labels.
    Sets ``self.worstResponse``, ``self.domain``, ``self.learner`` and
    ``self.classifier``.
    """
    classes = numpy.unique(y)
    if classes.shape[0] != 2:
        raise ValueError("Can only operate on binary data")
    self.worstResponse = classes[classes != self.bestResponse][0]

    # We need to convert y into indices
    XY = numpy.c_[X, self.labelsToInds(y)]

    # One float variable per column of X, then the two-valued class.
    attrList = [
        orange.FloatVariable("X" + str(i)) for i in range(X.shape[1])
    ]
    classVar = orange.EnumVariable("y")
    classVar.addValue(str(self.bestResponse))
    classVar.addValue(str(self.worstResponse))
    attrList.append(classVar)

    self.domain = orange.Domain(attrList)
    eTable = orange.ExampleTable(self.domain, XY)

    # Weight examples
    preprocessor = orange.Preprocessor_addClassWeight(equalize=1)
    preprocessor.classWeights = [1 - self.weight, self.weight]
    eTable, weightID = preprocessor(eTable)
    eTable.domain.addmeta(weightID, orange.FloatVariable("w"))

    baseTree = orngTree.TreeLearner(mForPruning=self.m,
                                    measure="gainRatio",
                                    minExamples=self.minSplit,
                                    maxDepth=self.maxDepth).instance()
    subsetSize = numpy.round(X.shape[1] * self.featureSize)
    self.learner = orngEnsemble.RandomForestLearner(learner=baseTree,
                                                    trees=self.numTrees,
                                                    attributes=subsetSize)
    self.classifier = self.learner(eTable, weightID)
def setLearner(self): learner = orngTree.TreeLearner(measure="retis", binarization=self.Bin, mForPruning=self.PostMPCheck and self.PostMPVal, minExamples=self.MinNodeCheck and self.MinNodeVal, storeExamples=1) if self.preprocessor: learner = self.preprocessor.wrapLearner(learner) learner.name = self.Name self.send("Learner", learner) self.error() classifier = None if self.data: try: classifier = learner(self.data) classifier.name = self.Name except orange.KernelException, (errValue): self.error(str(errValue)) classifier = None
def setLearner(self): learner = orngTree.TreeLearner(mesure="retis", binarization=self.Bin, mForPruning=self.PostMPCheck and self.PostMPVal, minExamples=self.MinNodeCheck and self.MinNodeVal, storeExamples=1) learner.name = self.Name self.send("Learner", learner) self.error() if not self.data: return try: classifier = learner(self.data) classifier.name = self.Name self.send("Regressor", classifier) self.send("Regression Tree", classifier) except orange.KernelException, (errValue): self.error(str(errValue)) self.send("Regressor", None) self.send("Regression Tree", None)
def learnModel(self, X, y):
    """Train a class-weighted decision tree on binary-labelled data.

    Raises ValueError when ``y`` does not hold exactly two distinct labels.
    Sets ``self.worstResponse``, ``self.domain``, ``self.learner`` and
    ``self.classifier``.
    """
    classes = numpy.unique(y)
    if classes.shape[0] != 2:
        raise ValueError("Can only operate on binary data")
    self.worstResponse = classes[classes != self.bestResponse][0]

    # We need to convert y into indices
    XY = numpy.c_[X, self.labelsToInds(y)]

    # One float variable per column of X, then the two-valued class.
    attrList = [
        orange.FloatVariable("X" + str(i)) for i in range(X.shape[1])
    ]
    classVar = orange.EnumVariable("y")
    classVar.addValue(str(self.bestResponse))
    classVar.addValue(str(self.worstResponse))
    attrList.append(classVar)

    self.domain = orange.Domain(attrList)
    eTable = orange.ExampleTable(self.domain, XY)

    # Weight examples and equalise.
    # Equalizing computes such weights that the weighted number of examples
    # in each class is equivalent.
    preprocessor = orange.Preprocessor_addClassWeight(equalize=1)
    preprocessor.classWeights = [1 - self.weight, self.weight]
    eTable, weightID = preprocessor(eTable)
    eTable.domain.addmeta(weightID, orange.FloatVariable("w"))

    treeLearner = orngTree.TreeLearner(m_pruning=self.m,
                                       measure="gainRatio")
    self.learner = treeLearner
    treeLearner.max_depth = self.maxDepth
    treeLearner.stop = orange.TreeStopCriteria_common()
    treeLearner.stop.min_instances = self.minSplit
    self.classifier = self.learner(eTable, weightID)
# Category: modelling, evaluation # Uses: housing # Classes: orngTest.crossValidation, orngTree.TreeLearner, orange.kNNLearner, orngRegression.LinearRegressionLearner # Referenced: regression.htm import orange import orngRegression import orngTree import orngStat, orngTest data = orange.ExampleTable("housing") # definition of learners (regressors) lr = orngRegression.LinearRegressionLearner(name="lr") rt = orngTree.TreeLearner(measure="retis", mForPruning=2, minExamples=20, name="rt") maj = orange.MajorityLearner(name="maj") knn = orange.kNNLearner(k=10, name="knn") learners = [maj, lr, rt, knn] # evaluation and reporting of scores results = orngTest.crossValidation(learners, data, folds=10) scores = [("MSE", orngStat.MSE), ("RMSE", orngStat.RMSE), ("MAE", orngStat.MAE), ("RSE", orngStat.RSE), ("RRSE", orngStat.RRSE), ("RAE", orngStat.RAE), ("R2", orngStat.R2)] print "Learner " + "".join(["%-7s" % s[0] for s in scores]) for i in range(len(learners)): print "%-8s " % learners[i].name + "".join( ["%6.3f " % s[1](results)[i] for s in scores])
# Description: Builds a regression tree and prints it out
# Category:    modelling
# Uses:        housing
# Classes:     orngTree.TreeLearner
# Referenced:  regression.htm

import orange, orngTree

data = orange.ExampleTable("../datasets/housing.tab")

# Passing the data to the learner constructor returns the induced tree.
rt = orngTree.TreeLearner(
    data,
    measure="retis",
    mForPruning=2,
    minExamples=20,
)
orngTree.printTxt(rt, leafStr="%V %I")
# Demo driver: shows the OWLearningCurveC widget and feeds it a data set
# and several learners.
appl = QApplication(sys.argv)
ow = OWLearningCurveC()
ow.show()

# Learner 1: naive Bayes with default settings.
l1 = orange.BayesLearner()
l1.name = 'Naive Bayes'
ow.learner(l1, 1)

data = orange.ExampleTable('iris.tab')
ow.dataset(data)

# Naive Bayes with m-estimates (m=10).
# NOTE(review): l2 is configured but never passed to ow.learner(); confirm
# whether an "ow.learner(l2, 2)" call is missing.
l2 = orange.BayesLearner()
l2.name = 'Naive Bayes (m=10)'
l2.estimatorConstructor = orange.ProbabilityEstimatorConstructor_m(m=10)
l2.conditionalEstimatorConstructor = orange.ConditionalProbabilityEstimatorConstructor_ByRows(
    estimatorConstructor=orange.ProbabilityEstimatorConstructor_m(m=10))

# Learner 3: k-nearest neighbours.
l3 = orange.kNNLearner(name="k-NN")
ow.learner(l3, 3)

# Learner 4: decision tree.
import orngTree
l4 = orngTree.TreeLearner(minSubset=2)
l4.name = "Decision Tree"
ow.learner(l4, 4)

# ow.learner(None, 1)
# ow.learner(None, 2)
# ow.learner(None, 4)

appl.exec_()
#!/usr/bin/env python
#
# Learns a classification tree for every data file named on the command
# line, prints it, and writes it to <name>.txt and <name>.dot.
#
# See also:
# http://www.ailab.si/orange/doc/ofb/c_otherclass.htm

import os.path
import sys

import orange
import orngTree

for arg in sys.argv[1:]:
    name, ext = os.path.splitext(arg)
    data = orange.ExampleTable(arg)
    tree = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=2)
    orngTree.printTxt(tree)

    # Close the dump explicitly instead of leaving it to garbage collection.
    out = open(name + '.txt', 'wt')
    try:
        out.write(orngTree.dumpTree(tree) + '\n')
    finally:
        out.close()

    orngTree.printDot(tree,
                      fileName=name + '.dot',
                      nodeShape='ellipse',
                      leafShape='box')
def main(): version = "%prog version 0.1" usage = "usage: %prog [options] [input] [options [classification]]" desc = "QUICK START: To extract data from a trial, 'cd' to the \ trial's directory and type: 'sqk --classify'. To extract data \ from one channel of the trial (ch 1 in this case), type: \ 'sqk --classify --channel=1'." # Parse command line options. parser = optparse.OptionParser(usage, version=version, description=desc) parser.add_option("-C", "--classify", dest="classify", action="store_true", default=False, help="Classify the trial. IMPORTANT: Trial folder must " \ "be the current directory.") parser.add_option("-m", "--channel", metavar="<CH>", dest="channel", action="store", type="int", default=0, help="Specify which channel to extract data from. " \ "Default (%default) extracts data from both " \ "channels. Must choose 0 (both channels), 1, or 2.") parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="Parses a log file if it exists and adds time and" \ " duration information to the data file.") parser.add_option("-T", "--traindata", metavar="<DATA_FILE>", dest="trainData", action="store", default=os.path.join(TRAIN_PATH, 'traindata'), help="Specify training data set. Default is %default") parser.add_option("-L", "--learner", metavar="<TYPE>", dest="learner", action="store", default="svm", help="Specify the classifier algorithm. Options include:" \ " 'bayes' (Naive Bayes), 'knn' (k-Nearest Neighbor)," \ " 'svm' (SVM), 'forest' (random forest). " \ "Default is %default.") parser.add_option("-f", "--file", metavar="<AUDIO_FILE>", dest="audio", action="store", help="Extract features and classify audio file (wav)") parser.add_option("-p", "--path", metavar="<PATH>", dest="path", action="store", help="Extract features and classify all files in a " \ "directory. To extract from current directory: " \ "'usv.py -p .' 
") parser.add_option("-r", "--rate", metavar="<SAMPLE_RATE>", dest="sampleRate", action="store", default="11025", help="Specify the sample rate of input files. Default is " \ "%default (Hz).") parser.add_option("-t", "--train", metavar="<CLASS>", dest="exampleClass", action="store", type='string', help="Label the training example(s).") parser.add_option("-d", "--data", metavar="<DATA_FILE>", dest="data", action="store", default="data.tab", help="Write to data file (.tab format). Default is " \ "'%default' or 'traindata.tab' for training data.") parser.add_option("-S", "--seg-resamp", dest="segment", action="store_true", default=False, help="Resample to 11025 Hz and split into multiple files " \ "based on silence. IMPORTANT: Trial folder must " \ "be the current directory.") (opts, args) = parser.parse_args() if opts.channel and not (opts.classify or opts.segment): parser.error("'--channel' option requires '--classify' option'") if opts.log and not opts.classify: parser.error("'--log' option requires '--classify' option'") # Open train data file or create it if it doesn't exist. if opts.exampleClass and opts.data == "data.tab": opts.data = os.path.join(TRAIN_PATH, 'traindata.tab') if opts.audio or opts.path: if not opts.segment: print 'Opening %r. . .' % (opts.data) data = open(opts.data, "a+") elif opts.segment: print "Resampling and segmenting trial. . ." elif opts.classify: print "Classifying trial. . ." else: parser.error('No input file or path specified.') # If user specifies an audio file (-f AUDIO_FILE) if opts.audio: file_name, ext = os.path.splitext(opts.audio) # Add MFCC 1-12 to data. if not opts.segment: write_features(opts.audio, opts.sampleRate, data) # If classification is specified, write to data. if opts.exampleClass: data.write(opts.exampleClass.lower() + "\n") print "Classified %r as %r." % (opts.audio, opts.exampleClass.lower()) # Else if user chooses to segment file (-S) elif opts.segment: print "Resampling and segmenting %s. . ." 
% (opts.audio) if opts.channel == 0: run_commands( seg_resamp(opts.audio, int(opts.sampleRate), outfile=file_name + '_call.wav', directory=file_name + "_ch1_2", ch1=True, ch2=True)) elif opts.channel == 1: run_commands( seg_resamp(opts.audio, int(opts.sampleRate), outfile=file_name + '_ch1_.wav', directory=file_name + "_ch1", ch1=True, ch2=False)) elif opts.channel == 2: run_commands( seg_resamp(opts.audio, int(opts.sampleRate), outfile=file_name + '_ch2_.wav', directory=file_name + "_ch2", ch1=False, ch2=True)) print "Wrote to './%s'." % (file_name + "_calls") else: print "Invalid data for %r. Skipping. . ." % opts.audio data.write('\n') # Else if user specifies path (-p PATH) elif opts.path: # Read all wav files in specified path try: for root, dirs, files in os.walk(opts.path): for basename in files: if fnmatch.fnmatch(basename, "*.[wW][aA][vV]"): audiofile = os.path.join(root, basename) # Skip small files if os.path.getsize(audiofile) < 100: continue file_name, ext = os.path.splitext(audiofile) # Add MFCC 1-12 to data. if not opts.segment: write_features(audiofile, opts.sampleRate, data) # Write filename data.write(str(os.path.basename(audiofile)) + "\t") # If classification is specified, write to file. if opts.exampleClass: data.write(opts.exampleClass.lower() + "\n") print "Classified %r as %r." % ( audiofile, opts.exampleClass.lower()) # If user specifies resample and segment elif opts.segment: print "Resampling and segmenting %r. . ." 
% ( audiofile) if opts.channel == 0: run_commands( seg_resamp( audiofile, int(opts.sampleRate), outfile=os.path.basename(file_name) + '_call.wav', directory=os.path.basename(file_name) + "_ch1_2", ch1=True, ch2=True)) elif opts.channel == 1: run_commands( seg_resamp( audiofile, int(opts.sampleRate), outfile=os.path.basename(file_name) + '_ch1_.wav', directory=os.path.basename(file_name) + "_ch1", ch1=True, ch2=False)) elif opts.channel == 2: run_commands( seg_resamp( audiofile, int(opts.sampleRate), outfile=os.path.basename(file_name) + '_ch2_.wav', directory=os.path.basename(file_name) + "_ch2", ch1=False, ch2=True)) else: data.write('\n') except (FloatingPointError, IOError): print "An error occurred. Skipping %. . .r" % audiofile # Else if user chooses to segment and resample the trial (current dir) elif opts.segment: for audiofile in glob(os.path.join('./', "*.[wW][aA][vV]")): file_name, ext = os.path.splitext(audiofile) print "Resampling and segmenting %r. . ." % (file_name) if opts.channel == 0: run_commands( seg_resamp(audiofile, int(opts.sampleRate), outfile=file_name + '_call.wav', directory=file_name + "_ch1_2", ch1=True, ch2=True)) elif opts.channel == 1: run_commands( seg_resamp(audiofile, int(opts.sampleRate), outfile=file_name + '_ch1_.wav', directory=file_name + "_ch1", ch1=True, ch2=False)) elif opts.channel == 2: run_commands( seg_resamp(audiofile, int(opts.sampleRate), outfile=file_name + '_ch2_.wav', directory=file_name + "_ch2", ch1=False, ch2=True)) # Else if user chooses to classify the trial elif opts.classify: # TODO: Should not be able to classify if no data files in folder try: traindata = orange.ExampleTable(opts.trainData) except SystemError: print "Training data not found." sys.exit(1) # The logger if opts.log: logs = glob(os.path.join(os.getcwd(), "*.[lL][oO][gG]")) if len(logs) > 1: print "ERROR: Multiple log files." 
sys.exit(1) log = usv.avisoftlog.RecLog(open(logs[0], 'r')) # The classifier print "Constructing %s classifier \ (may take several minutes). . ." % (opts.learner) if opts.learner.lower() == "bayes": classifier = orange.BayesLearner(traindata) classifier.name = "naive_bayes" elif opts.learner.lower() == "knn": classifier = Orange.classification.knn.kNNLearner(traindata) classifier.name = "kNN" elif opts.learner.lower() == "svm": svm = SVMLearner(name="SVM", kernel_type=kernels.RBF, C=128, gamma=2, nu=0.1) classifier = svm(traindata) classifier.name = "SVM" elif opts.learner.lower() == "tree": classifier = orngTree.TreeLearner(traindata) classifier.name = "tree" elif opts.learner.lower() == "forest": classifier = Orange.ensemble.forest.RandomForestLearner(traindata) classifier.name = "random_forest" # Create data summary file if opts.channel == 0: datasummary_name = os.path.splitext(opts.data)[0] + "_ch1_2.tab" elif opts.channel == 1: datasummary_name = os.path.splitext(opts.data)[0] + "_ch1.tab" elif opts.channel == 2: datasummary_name = os.path.splitext(opts.data)[0] + "_ch2.tab" if os.path.exists(datasummary_name): print "Data file %r already exists." % (datasummary_name) print "Exiting . . ." 
sys.exit(1) else: summary = open(datasummary_name, "a+") # Write metadata summary.write("# data = %s\n" % (datasummary_name)) summary.write("# channel = %d\n" % (opts.channel)) summary.write("# sample_rate = %s\n" % (opts.sampleRate)) summary.write("# classifier = %s\n" % (classifier.name)) # Write header summary.write("FILE\t") for i in range(len(traindata.domain.classVar.values)): summary.write(traindata.domain.classVar.values[i].upper() + "\t") if opts.log: summary.write("start: " + str(log.start.time) + "\t") summary.write("Duration" + "\t") summary.write("\n") totals = [0] * len(traindata.domain.classVar.values) proportions = [0.0] * len(totals) for root, dirs, files in os.walk(os.getcwd()): # For each file's directory in this trial for dir in dirs: data = open(os.path.join(dir, dir + '.tab'), 'w+') if opts.channel == 0: calls = glob(os.path.join(dir, "*ch1_2*.[wW][aA][vV]")) elif opts.channel == 1: calls = glob(os.path.join(dir, "*ch1*.[wW][aA][vV]")) elif opts.channel == 2: calls = glob(os.path.join(dir, "*ch2*.[wW][aA][vV]")) # For each call for c in calls: # Skip small files if os.path.getsize(c) < 100: print "Skipping %s (not enough data)" % c continue # Write feature data write_features(c, opts.sampleRate, data) data.close() # Ensures that data is saved # Write filenames and classifications data = open(os.path.join(dir, dir + '.tab'), 'a+') datatable = orange.ExampleTable( os.path.join(dir, dir + '.tab')) classification = classifier(datatable[calls.index(c)]) data.write(str(os.path.basename(c)) + '\t') data.write(str(classification)) data.write('\n') try: data.close() except UnboundLocalError: parser.error( 'No directories in this folder. Did you remember to segment the files?' 
) # Write class count data to summary table for dir in dirs: if opts.channel == 0: data_files = glob(os.path.join(dir, "*ch1_2.tab")) elif opts.channel == 1: data_files = glob(os.path.join(dir, "*ch1.tab")) elif opts.channel == 2: data_files = glob(os.path.join(dir, "*ch2.tab")) for c in data_files: if os.path.getsize(c) == 0: continue file_name, ext = os.path.splitext(os.path.basename(c)) summary.write(file_name + '\t') callsdata = orange.ExampleTable(os.path.join("./", c)) # Vector of class counts counts = [0] * len(callsdata.domain.classVar.values) for e in callsdata: counts[int(e.getclass())] += 1 # Write counts for i in range(len(callsdata.domain.classVar.values)): summary.write(str(counts[i]) + "\t") totals[i] += counts[i] # Write log data if opts.log: tmp = str(os.path.basename(dir)).lower() entry = tmp[0:tmp.find("_")] + ".wav" summary.write(str(log.getevent(entry).time) + "\t") summary.write(log.getevent(entry).duration + "\t") log.close() summary.write('\n') # Write totals. Exclude BGNOISE. summary.write("TOTAL" + "\t\t") for i in range(1, len(totals)): summary.write(str(totals[i]) + "\t") if opts.log: summary.write("end: " + str(log.end.time) + "\t") summary.write("\n") # Write proportions. Exclude BGNOISE. summary.write("P" + "\t\t") for i in range(1, len(proportions)): try: proportions[i] = float( totals[i]) / float(sum(totals) - totals[0]) except ZeroDivisionError: proportions[i] = 0.0 summary.write("%.4f\t" % (proportions[i])) summary.write("\n") summary.close() # Open data file when finished subprocess.call('open %s' % (datasummary_name), shell=True) else: data.write("\n") if not opts.segment: data.close() print "Success!"
def makeLearner(self):
    """Return a tree learner that keeps the training examples in its nodes."""
    return orngTree.TreeLearner(storeExamples=True)
# Description: Demostration of use of cross-validation as provided in orngEval module # Category: evaluation # Uses: voting.tab # Classes: orngTest.crossValidation # Referenced: c_performance.htm import orange, orngTest, orngStat, orngTree # set up the learners bayes = orange.BayesLearner() tree = orngTree.TreeLearner(mForPruning=2) bayes.name = "bayes" tree.name = "tree" learners = [bayes, tree] # compute accuracies on data data = orange.ExampleTable("voting") results = orngTest.crossValidation(learners, data, folds=10) # output the results print "Learner CA IS Brier AUC" for i in range(len(learners)): print "%-8s %5.3f %5.3f %5.3f %5.3f" % (learners[i].name, \ orngStat.CA(results)[i], orngStat.IS(results)[i], orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
if __name__ == "__main__": a = QApplication(sys.argv) ow = OWPredictions() ow.show() import orngTree dataset = orange.ExampleTable('../../doc/datasets/iris.tab') # dataset = orange.ExampleTable('../../doc/datasets/auto-mpg.tab') ind = orange.MakeRandomIndices2(p0=0.5)(dataset) data = dataset.select(ind, 0) test = dataset.select(ind, 1) testnoclass = orange.ExampleTable( orange.Domain(test.domain.attributes, False), test) tree = orngTree.TreeLearner(data) tree.name = "tree" maj = orange.MajorityLearner(data) maj.name = "maj" knn = orange.kNNLearner(data, k=10) knn.name = "knn" if 0: # data set only ow.setData(test) if 0: # two predictors, test data with class ow.setPredictor(maj, 1) ow.setPredictor(tree, 2) ow.setData(test) if 0: # two predictors, test data with no class ow.setPredictor(maj, 1) ow.setPredictor(tree, 2)