def __call__(self, data, weight_id=0, **kwds):
    """Select attributes stepwise and train ``self.learner`` on the result.

    Starting from a domain with no attributes, each pass first tries to
    remove one attribute (only when at least two are selected) and then
    tries to add one.  Candidates are scored with ``self.stat`` on the
    cross-validation results of ``self.learner``; a score counts as better
    when ``cmp(new, old) == self.statsign``.  The loop ends when a whole
    pass neither removes nor adds an attribute.

    Parameters:
        data: the data table to select attributes on and to train on.
        weight_id: id of the weight meta attribute, passed on to the
            evaluation and to the final learner call.
        **kwds: copied into ``self.__dict__`` before anything else, so
            they may override e.g. ``add_threshold`` or ``folds``.

    Returns:
        Whatever ``self.learner`` returns for the data projected onto the
        selected domain.

    Raises:
        ValueError: if ``remove_threshold`` is smaller than
            ``add_threshold``.
    """
    import Orange.evaluation.testing
    import Orange.evaluation.scoring
    import statc

    self.__dict__.update(kwds)

    if self.remove_threshold < self.add_threshold:
        raise ValueError(
            "'remove_threshold' should be larger or equal to 'add_threshold'"
        )

    classVar = data.domain.classVar
    # One fixed set of fold indices, so all candidates are compared on the
    # same splits.
    indices = Orange.core.MakeRandomIndicesCV(
        data, folds=getattr(self, "folds", 10))

    def evaluate(candidate):
        """Return (overall stat, list of per-iteration stats) for a domain."""
        res = Orange.evaluation.testing.test_with_indices(
            [self.learner],
            (Orange.data.Table(candidate, data), weight_id),
            indices)
        overall = self.stat(res)[0]
        per_iteration = [
            self.stat(part)[0]
            for part in Orange.evaluation.scoring.split_by_iterations(res)
        ]
        return overall, per_iteration

    # Baseline: no attributes at all.  The weight is passed here as well,
    # so the baseline is measured the same way as every later candidate.
    domain = Orange.data.Domain([], classVar)
    oldStat, oldStats = evaluate(domain)
    # Progress lines are pre-formatted so the output is the same as with
    # the Python 2 print statement.
    print(". %s %s" % (oldStat, domain))

    stop = False
    while not stop:
        stop = True

        # --- Removal step -------------------------------------------------
        if len(domain.attributes) >= 2:
            # All three are reset together; None (not falsiness) marks
            # "nothing chosen yet", since a score of 0.0 is a valid value.
            bestStat = bestStats = bestAttr = None
            for attr in domain.attributes:
                newdomain = Orange.data.Domain(
                    [x for x in domain.attributes if x != attr], classVar)
                newStat, newStats = evaluate(newdomain)
                print("- %s %s" % (newStat, newdomain))

                if bestAttr is None or cmp(newStat, bestStat) == self.statsign:
                    # Removal is acceptable if the score improves, or if
                    # the second element of the Wilcoxon result
                    # (presumably the p-value) exceeds remove_threshold.
                    if (cmp(newStat, oldStat) == self.statsign
                            or statc.wilcoxont(oldStats, newStats)[1]
                            > self.remove_threshold):
                        bestStat, bestStats, bestAttr = newStat, newStats, attr

            if bestAttr is not None:
                domain = Orange.data.Domain(
                    [x for x in domain.attributes if x != bestAttr], classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print("removed %s" % (bestAttr.name,))

        # --- Addition step ------------------------------------------------
        bestStat, bestAttr = oldStat, None
        for attr in data.domain.attributes:
            if attr in domain.attributes:
                continue
            newdomain = Orange.data.Domain(domain.attributes + [attr], classVar)
            newStat, newStats = evaluate(newdomain)
            print("+ %s %s" % (newStat, newdomain))

            # The candidate must beat the best so far and its Wilcoxon
            # value must be below add_threshold.
            if (cmp(newStat, bestStat) == self.statsign
                    and statc.wilcoxont(oldStats, newStats)[1]
                    < self.add_threshold):
                bestStat, bestStats, bestAttr = newStat, newStats, attr

        if bestAttr is not None:
            domain = Orange.data.Domain(domain.attributes + [bestAttr], classVar)
            oldStat, oldStats = bestStat, bestStats
            stop = False
            print("added %s" % (bestAttr.name,))

    return self.learner(Orange.data.Table(domain, data), weight_id)
def __call__(self, data, weight_id=0, **kwds):
    """Select attributes stepwise and train ``self.learner`` on the result.

    Starting from a domain with no attributes, each pass first tries to
    remove one attribute (only when at least two are selected) and then
    tries to add one.  Candidates are scored with ``self.stat`` on the
    cross-validation results of ``self.learner``; a score counts as better
    when ``cmp(new, old) == self.statsign``.  The loop ends when a whole
    pass neither removes nor adds an attribute.

    Parameters:
        data: the data table to select attributes on and to train on.
        weight_id: id of the weight meta attribute, passed on to the
            evaluation and to the final learner call.
        **kwds: copied into ``self.__dict__`` before anything else, so
            they may override e.g. ``add_threshold`` or ``folds``.

    Returns:
        Whatever ``self.learner`` returns for the data projected onto the
        selected domain.

    Raises:
        ValueError: if ``remove_threshold`` is smaller than
            ``add_threshold``.
    """
    import Orange.evaluation.testing
    import Orange.evaluation.scoring
    import statc

    self.__dict__.update(kwds)

    if self.remove_threshold < self.add_threshold:
        raise ValueError(
            "'remove_threshold' should be larger or equal to 'add_threshold'"
        )

    classVar = data.domain.classVar
    # One fixed set of fold indices, so all candidates are compared on the
    # same splits.
    indices = Orange.core.MakeRandomIndicesCV(
        data, folds=getattr(self, "folds", 10))

    def evaluate(candidate):
        """Return (overall stat, list of per-iteration stats) for a domain."""
        res = Orange.evaluation.testing.test_with_indices(
            [self.learner],
            (Orange.data.Table(candidate, data), weight_id),
            indices)
        overall = self.stat(res)[0]
        per_iteration = [
            self.stat(part)[0]
            for part in Orange.evaluation.scoring.split_by_iterations(res)
        ]
        return overall, per_iteration

    # Baseline: no attributes at all.  The weight is passed here as well,
    # so the baseline is measured the same way as every later candidate.
    domain = Orange.data.Domain([], classVar)
    oldStat, oldStats = evaluate(domain)
    # Progress lines are pre-formatted so the output is the same as with
    # the Python 2 print statement.
    print(". %s %s" % (oldStat, domain))

    stop = False
    while not stop:
        stop = True

        # --- Removal step -------------------------------------------------
        if len(domain.attributes) >= 2:
            # All three are reset together; None (not falsiness) marks
            # "nothing chosen yet", since a score of 0.0 is a valid value.
            bestStat = bestStats = bestAttr = None
            for attr in domain.attributes:
                newdomain = Orange.data.Domain(
                    [x for x in domain.attributes if x != attr], classVar)
                newStat, newStats = evaluate(newdomain)
                print("- %s %s" % (newStat, newdomain))

                if bestAttr is None or cmp(newStat, bestStat) == self.statsign:
                    # Removal is acceptable if the score improves, or if
                    # the second element of the Wilcoxon result
                    # (presumably the p-value) exceeds remove_threshold.
                    if (cmp(newStat, oldStat) == self.statsign
                            or statc.wilcoxont(oldStats, newStats)[1]
                            > self.remove_threshold):
                        bestStat, bestStats, bestAttr = newStat, newStats, attr

            if bestAttr is not None:
                domain = Orange.data.Domain(
                    [x for x in domain.attributes if x != bestAttr], classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print("removed %s" % (bestAttr.name,))

        # --- Addition step ------------------------------------------------
        bestStat, bestAttr = oldStat, None
        for attr in data.domain.attributes:
            if attr in domain.attributes:
                continue
            newdomain = Orange.data.Domain(domain.attributes + [attr], classVar)
            newStat, newStats = evaluate(newdomain)
            print("+ %s %s" % (newStat, newdomain))

            # The candidate must beat the best so far and its Wilcoxon
            # value must be below add_threshold.
            if (cmp(newStat, bestStat) == self.statsign
                    and statc.wilcoxont(oldStats, newStats)[1]
                    < self.add_threshold):
                bestStat, bestStats, bestAttr = newStat, newStats, attr

        if bestAttr is not None:
            domain = Orange.data.Domain(domain.attributes + [bestAttr], classVar)
            oldStat, oldStats = bestStat, bestStats
            stop = False
            print("added %s" % (bestAttr.name,))

    return self.learner(Orange.data.Table(domain, data), weight_id)
def _signed_rank_sums(accLearner1, accLearner2):
    """Return (Rplus, Rminus): rank sums of the paired differences."""
    diffs = [first - second for first, second in zip(accLearner1, accLearner2)]
    count = len(diffs)

    # Positions of the differences ordered by absolute value (small first).
    order = sorted(range(count), key=lambda pos: abs(diffs[pos]))

    # Ranks are 1-based; differences with equal magnitude share the mean
    # of the ranks they occupy, whatever their sign.
    ranks = [0.0] * count
    start = 0
    while start < count:
        magnitude = abs(diffs[order[start]])
        end = start
        while end + 1 < count and abs(diffs[order[end + 1]]) == magnitude:
            end += 1
        shared_rank = (start + end) / 2.0 + 1.0
        for pos in range(start, end + 1):
            ranks[order[pos]] = shared_rank
        start = end + 1

    Rplus = 0.0
    Rminus = 0.0
    for diff, rank in zip(diffs, ranks):
        if diff > 0:
            Rplus += rank
        elif diff < 0:
            Rminus += rank
        else:
            # A zero difference favours neither learner: split its rank.
            Rplus += rank / 2.0
            Rminus += rank / 2.0
    return Rplus, Rminus


def WilcoxonRankTest(accLearner1, accLearner2):
    """Compare two learners from paired result lists with a Wilcoxon test.

    The input is two lists of equal length holding the value pairs to be
    compared (element i of each list belongs to the same data set).
    The absolute pairwise differences are ranked; ranks of positive
    differences (learner 1 larger) are summed into Rplus, ranks of negative
    differences into Rminus, and ranks of zero differences are split
    evenly.  The test statistic T itself is taken from
    ``statc.wilcoxont(accLearner1, accLearner2)[0]``.

    See critical values:
    http://www.euronet.nl/users/warnar/demostatistiek/tables/WILCOXONTABEL.htm
    http://web.anglia.ac.uk/numbers/biostatistics/wilcoxon/local_folder/critical_values.html

    Returns:
        A tuple ``(index, info)``: ``index`` is 1 when Rplus < Rminus
        (learner 2 has the larger rank sum) and 0 otherwise; ``info`` is
        the text report that is also printed.

    Raises:
        ValueError: if the two lists differ in length.
    """
    if len(accLearner1) != len(accLearner2):
        raise ValueError(
            "WilcoxonRankTest needs two lists of equal length "
            "(got %d and %d)" % (len(accLearner1), len(accLearner2)))

    # The greater the rank sum, the more accurate the learner.
    Rplus, Rminus = _signed_rank_sums(accLearner1, accLearner2)

    # The statistic comes from Orange's own implementation.
    T = statc.wilcoxont(accLearner1, accLearner2)[0]
    N = len(accLearner1)

    print("Rank sum of learner 1")
    print(Rplus)
    print("Rank sum of learner 2")
    print(Rminus)
    if Rplus < Rminus:
        print("The hypothesis is that learner 2 is the most accurate")
    else:
        print("The hypothesis is that learner 1 is the most accurate")

    info = "".join([
        "If the number of data sets (N) is equal to 16 (our regression suite):\n",
        "N " + str(N) + "\n",
        "If T < 35 there is a 10% chance that the hypothesis is not true\n",
        "If T < 29 there is a 5% chance that the hypothesis is not true\n",
        "T " + str(T) + "\n",
        "If the number of data sets (N) is equal to 17 (our classification suite):\n",
        "N " + str(N) + "\n",
        "If T < 41 there is a 10% chance that the hypothesis is not true\n",
        "If T < 34 there is a 5% chance that the hypothesis is not true\n",
        "T " + str(T) + "\n",
    ])

    # NOTE: for N > 20 a normal approximation could be reported instead:
    #   z = (T - N*(N+1)/4) / sqrt(N*(N+1)*(2*N+1)/24)
    print(info)

    # Return the index of the best learner.
    if Rplus < Rminus:
        return (1, info)
    return (0, info)
def __call__(self, examples, weightID=0, **kwds):
    """Select attributes stepwise and train ``self.learner`` on the result.

    Starting from a domain with no attributes, each pass first tries to
    remove one attribute (only when at least two are selected) and then
    tries to add one.  Candidates are scored with ``self.stat`` on the
    cross-validation results of ``self.learner``; a score counts as better
    when ``cmp(new, old) == self.statsign``.  The loop ends when a whole
    pass neither removes nor adds an attribute.

    Parameters:
        examples: the example table to select attributes on and train on.
        weightID: id of the weight meta attribute, passed on to the
            evaluation and to the final learner call.
        **kwds: copied into ``self.__dict__`` before anything else, so
            they may override e.g. ``addThreshold`` or ``folds``.

    Returns:
        Whatever ``self.learner`` returns for the examples projected onto
        the selected domain.

    Raises:
        ValueError: if ``removeThreshold`` is smaller than
            ``addThreshold``.
    """
    import orngTest
    import orngStat
    import statc

    self.__dict__.update(kwds)

    if self.removeThreshold < self.addThreshold:
        # A plain string cannot be raised; use a real exception class.
        raise ValueError(
            "'removeThreshold' should be larger or equal to 'addThreshold'")

    classVar = examples.domain.classVar
    # One fixed set of fold indices, so all candidates are compared on the
    # same splits.
    indices = orange.MakeRandomIndicesCV(
        examples, folds=getattr(self, "folds", 10))

    def evaluate(candidate):
        """Return (overall stat, list of per-iteration stats) for a domain."""
        res = orngTest.testWithIndices(
            [self.learner],
            (orange.ExampleTable(candidate, examples), weightID),
            indices)
        overall = self.stat(res)[0]
        per_iteration = [
            self.stat(part)[0] for part in orngStat.splitByIterations(res)
        ]
        return overall, per_iteration

    # Baseline: no attributes at all.  The weight is passed here as well,
    # so the baseline is measured the same way as every later candidate.
    domain = orange.Domain([], classVar)
    oldStat, oldStats = evaluate(domain)
    # Progress lines are pre-formatted so the output is the same as with
    # the Python 2 print statement.
    print(". %s %s" % (oldStat, domain))

    stop = False
    while not stop:
        stop = True

        # --- Removal step -------------------------------------------------
        if len(domain.attributes) >= 2:
            # All three are reset together; None (not falsiness) marks
            # "nothing chosen yet", since a score of 0.0 is a valid value.
            bestStat = bestStats = bestAttr = None
            for attr in domain.attributes:
                newdomain = orange.Domain(
                    [x for x in domain.attributes if x != attr], classVar)
                newStat, newStats = evaluate(newdomain)
                print("- %s %s" % (newStat, newdomain))

                if bestAttr is None or cmp(newStat, bestStat) == self.statsign:
                    # Removal is acceptable if the score improves, or if
                    # the second element of the Wilcoxon result
                    # (presumably the p-value) exceeds removeThreshold.
                    if (cmp(newStat, oldStat) == self.statsign
                            or statc.wilcoxont(oldStats, newStats)[1]
                            > self.removeThreshold):
                        bestStat, bestStats, bestAttr = newStat, newStats, attr

            if bestAttr is not None:
                domain = orange.Domain(
                    [x for x in domain.attributes if x != bestAttr], classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print("removed %s" % (bestAttr.name,))

        # --- Addition step ------------------------------------------------
        bestStat, bestAttr = oldStat, None
        for attr in examples.domain.attributes:
            if attr in domain.attributes:
                continue
            newdomain = orange.Domain(domain.attributes + [attr], classVar)
            newStat, newStats = evaluate(newdomain)
            print("+ %s %s" % (newStat, newdomain))

            # The candidate must beat the best so far and its Wilcoxon
            # value must be below addThreshold.
            if (cmp(newStat, bestStat) == self.statsign
                    and statc.wilcoxont(oldStats, newStats)[1]
                    < self.addThreshold):
                bestStat, bestStats, bestAttr = newStat, newStats, attr

        if bestAttr is not None:
            domain = orange.Domain(domain.attributes + [bestAttr], classVar)
            oldStat, oldStats = bestStat, bestStats
            stop = False
            print("added %s" % (bestAttr.name,))

    return self.learner(orange.ExampleTable(domain, examples), weightID)
def _signed_rank_sums(accLearner1, accLearner2):
    """Return (Rplus, Rminus): rank sums of the paired differences."""
    diffs = [first - second for first, second in zip(accLearner1, accLearner2)]
    count = len(diffs)

    # Positions of the differences ordered by absolute value (small first).
    order = sorted(range(count), key=lambda pos: abs(diffs[pos]))

    # Ranks are 1-based; differences with equal magnitude share the mean
    # of the ranks they occupy, whatever their sign.
    ranks = [0.0] * count
    start = 0
    while start < count:
        magnitude = abs(diffs[order[start]])
        end = start
        while end + 1 < count and abs(diffs[order[end + 1]]) == magnitude:
            end += 1
        shared_rank = (start + end) / 2.0 + 1.0
        for pos in range(start, end + 1):
            ranks[order[pos]] = shared_rank
        start = end + 1

    Rplus = 0.0
    Rminus = 0.0
    for diff, rank in zip(diffs, ranks):
        if diff > 0:
            Rplus += rank
        elif diff < 0:
            Rminus += rank
        else:
            # A zero difference favours neither learner: split its rank.
            Rplus += rank / 2.0
            Rminus += rank / 2.0
    return Rplus, Rminus


def WilcoxonRankTest(accLearner1, accLearner2):
    """Compare two learners from paired result lists with a Wilcoxon test.

    The input is two lists of equal length holding the value pairs to be
    compared (element i of each list belongs to the same data set).
    The absolute pairwise differences are ranked; ranks of positive
    differences (learner 1 larger) are summed into Rplus, ranks of negative
    differences into Rminus, and ranks of zero differences are split
    evenly.  The test statistic T itself is taken from
    ``statc.wilcoxont(accLearner1, accLearner2)[0]``.

    See critical values:
    http://www.euronet.nl/users/warnar/demostatistiek/tables/WILCOXONTABEL.htm
    http://web.anglia.ac.uk/numbers/biostatistics/wilcoxon/local_folder/critical_values.html

    Returns:
        A tuple ``(index, info)``: ``index`` is 1 when Rplus < Rminus
        (learner 2 has the larger rank sum) and 0 otherwise; ``info`` is
        the text report that is also printed.

    Raises:
        ValueError: if the two lists differ in length.
    """
    if len(accLearner1) != len(accLearner2):
        raise ValueError(
            "WilcoxonRankTest needs two lists of equal length "
            "(got %d and %d)" % (len(accLearner1), len(accLearner2)))

    # The greater the rank sum, the more accurate the learner.
    Rplus, Rminus = _signed_rank_sums(accLearner1, accLearner2)

    # The statistic comes from Orange's own implementation.
    T = statc.wilcoxont(accLearner1, accLearner2)[0]
    N = len(accLearner1)

    print("Rank sum of learner 1")
    print(Rplus)
    print("Rank sum of learner 2")
    print(Rminus)
    if Rplus < Rminus:
        print("The hypothesis is that learner 2 is the most accurate")
    else:
        print("The hypothesis is that learner 1 is the most accurate")

    info = "".join([
        "If the number of data sets (N) is equal to 16 (our regression suite):\n",
        "N " + str(N) + "\n",
        "If T < 35 there is a 10% chance that the hypothesis is not true\n",
        "If T < 29 there is a 5% chance that the hypothesis is not true\n",
        "T " + str(T) + "\n",
        "If the number of data sets (N) is equal to 17 (our classification suite):\n",
        "N " + str(N) + "\n",
        "If T < 41 there is a 10% chance that the hypothesis is not true\n",
        "If T < 34 there is a 5% chance that the hypothesis is not true\n",
        "T " + str(T) + "\n",
    ])

    # NOTE: for N > 20 a normal approximation could be reported instead:
    #   z = (T - N*(N+1)/4) / sqrt(N*(N+1)*(2*N+1)/24)
    print(info)

    # Return the index of the best learner.
    if Rplus < Rminus:
        return (1, info)
    return (0, info)