def scatter(r1, r2, statistic="roc", x1Label="", x2Label="", fileName=None, **args): """ a scatter plot for comparing the performance of two classifiers :Parameters: - `r1, r2` - both are either a list of Result classes, or a list of success rates / ROC scores - `statistic` - which measure of classifier success to plot values : 'roc', 'successRate', 'balancedSuccessRate' in order to specify parts of the roc curve you can use something like: 'roc50' or 'roc0.1' :Keywords: - `title` - the title of the plot """ if len(r1) != len(r2): print "unequal lengths for r1 and r2" if type(r1) != type({}): raise ValueError, "Cannot handle unequal length when it is not a dict" keys1 = r1.keys() keys2 = r2.keys() common = misc.intersect(keys1, keys2) r1new = {} r2new = {} for key in common: r1new[key] = r1[key] r2new[key] = r2[key] r1 = r1new r2 = r2new if type(r1) == type({}) and type(r2) == type({}): I = r1.keys() else: I = range(len(r1)) if r1[I[0]].__class__.__name__ == "Results" or r1[I[0]].__class__.__name__ == "Container": p1 = misc.extractAttribute(r1, statistic) p2 = misc.extractAttribute(r2, statistic) else: p1 = r1 p2 = r2 if type(p1) == type({}): p1 = p1.values() p2 = p2.values() from matplotlib import pylab x = numpy.arange(0, 1, 0.01) pylab.plot(p1, p2, "bo", x, x, "-k") pylab.xlabel(x1Label, fontsize=18) pylab.ylabel(x2Label, fontsize=18) if "title" in args: pylab.title(args["title"], fontsize=18) pylab.show() if fileName is not None: pylab.savefig(fileName) pylab.close()
def extractNumFeatures(resultsFileName) :
    """extract, for each key of a saved results dictionary, the average
    number of features used across its folds, as reported in the
    'foldInfo' attribute"""

    r = myio.load(resultsFileName)
    numFeatures = {}
    if type(r) == type({}) :
        info = misc.extractAttribute(r, 'foldInfo')
        for key in info :
            numFeat = []
            for lines in info[key] :
                for line in lines.split('\n') :
                    if line.find('number of features') == 0 :
                        numFeat.append(float(line.split(':')[1]))
            numFeatures[key] = numpy.average(numFeat)

    return numFeatures
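# The following is an illustrative usage sketch, not part of the original
# module : the file name 'results.pickle' is hypothetical, and the function
# assumes the saved object is a dictionary whose 'foldInfo' strings contain
# lines of the form 'number of features : <n>'.
def _extractNumFeaturesExample() :
    """illustrative sketch : average number of features per key of a saved
    results dictionary (the file name is made up)"""
    numFeatures = extractNumFeatures('results.pickle')
    for key in numFeatures :
        print key, numFeatures[key]
    return numFeatures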
def significance(r1, r2, statistic = 'roc') :
    """
    report the statistical significance of the difference in performance
    between two classifiers over a series of classification results, using
    the Wilcoxon signed rank test.

    Returns: pvalue, (median1, median2)
    where:
    pvalue - the p-value of the two-sided Wilcoxon signed rank test;
    to get the p-value of a one-sided test, divide the p-value by two.
    (median1, median2) - the medians of the statistics of the inputs r1 and r2.

    :Parameters:
      - `r1, r2` - both are either a list of Results objects, or a list of
        success rates
      - `statistic` - which measure of classifier success to compare
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the roc curve you can use something like:
        'roc50' or 'roc0.1'
    """

    if type(r1) != type(r2) :
        raise ValueError, 'r1 and r2 do not have the same type'
    # if the two objects are dictionaries we can handle unequal lengths by
    # restricting both to the keys they have in common :
    if len(r1) != len(r2) :
        print 'unequal lengths for r1 and r2'
        if type(r1) != type({}) :
            raise ValueError, 'Cannot handle unequal length when it is not a dict'
        common = misc.intersect(r1.keys(), r2.keys())
        r1new = {}
        r2new = {}
        for key in common :
            r1new[key] = r1[key]
            r2new[key] = r2[key]
        r1 = r1new
        r2 = r2new

    if type(r1) == type({}) :
        if r1.keys() != r2.keys() :
            raise ValueError, 'r1 and r2 do not have the same keys'
        I = r1.keys()
    else :
        I = range(len(r1))

    # extract the requested statistic when given Results/Container objects;
    # otherwise the inputs are assumed to be the statistic values themselves :
    if (r1[I[0]].__class__.__name__ == 'Results' or
        r1[I[0]].__class__.__name__ == 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2
    if type(p1) == type({}) :
        p1 = p1.values()
        p2 = p2.values()

    test = salstat_stats.TwoSampleTests(p1, p2)
    test.SignedRanks(p1, p2)
    p = test.prob
    median1 = numpy.median(numpy.array(p1))
    median2 = numpy.median(numpy.array(p2))

    return p, (median1, median2)
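# The following is an illustrative usage sketch, not part of the original
# module : the dictionaries map made-up dataset names to made-up ROC scores
# of two classifiers; only the calling convention matters.
def _significanceExample() :
    """illustrative sketch : compare two classifiers given per-dataset ROC
    scores; the returned p-value is two sided (divide by two for a one
    sided test)"""
    rocs1 = {'data1' : 0.91, 'data2' : 0.84, 'data3' : 0.77,
             'data4' : 0.88, 'data5' : 0.69}
    rocs2 = {'data1' : 0.88, 'data2' : 0.80, 'data3' : 0.79,
             'data4' : 0.83, 'data5' : 0.66}
    pvalue, (median1, median2) = significance(rocs1, rocs2, statistic = 'roc')
    return pvalue, (median1, median2)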
def scatter(r1, r2, statistic = 'roc', x1Label = '', x2Label = '', fileName = None, **args) :
    """
    a scatter plot for comparing the performance of two classifiers

    :Parameters:
      - `r1, r2` - both are either a list of Results objects, or a list of
        success rates / ROC scores
      - `statistic` - which measure of classifier success to plot
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the roc curve you can use something like:
        'roc50' or 'roc0.1'

    :Keywords:
      - `title` - the title of the plot
    """

    # if the two objects are dictionaries we can handle unequal lengths by
    # restricting both to the keys they have in common :
    if len(r1) != len(r2) :
        print 'unequal lengths for r1 and r2'
        if type(r1) != type({}) :
            raise ValueError, 'Cannot handle unequal length when it is not a dict'
        common = misc.intersect(r1.keys(), r2.keys())
        r1new = {}
        r2new = {}
        for key in common :
            r1new[key] = r1[key]
            r2new[key] = r2[key]
        r1 = r1new
        r2 = r2new

    if type(r1) == type({}) and type(r2) == type({}) :
        I = r1.keys()
    else :
        I = range(len(r1))

    # extract the requested statistic when given Results/Container objects;
    # otherwise the inputs are assumed to be the statistic values themselves :
    if (r1[I[0]].__class__.__name__ == 'Results' or
        r1[I[0]].__class__.__name__ == 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2
    if type(p1) == type({}) :
        p1 = p1.values()
        p2 = p2.values()

    from matplotlib import pylab
    # plot the paired scores against each other, with the diagonal y = x
    # as a reference line :
    x = numpy.arange(0, 1, 0.01)
    pylab.plot(p1, p2, 'bo', x, x, '-k')
    pylab.xlabel(x1Label, fontsize = 18)
    pylab.ylabel(x2Label, fontsize = 18)
    if 'title' in args :
        pylab.title(args['title'], fontsize = 18)
    pylab.show()
    if fileName is not None :
        pylab.savefig(fileName)
    pylab.close()
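# The following is an illustrative usage sketch, not part of the original
# module : the dataset names, ROC scores and output file name are made up;
# it only demonstrates the calling convention of scatter.
def _scatterExample() :
    """illustrative sketch : scatter plot of per-dataset ROC scores of two
    classifiers, saved to a hypothetical file 'comparison.png'"""
    rocs1 = {'data1' : 0.91, 'data2' : 0.84, 'data3' : 0.77,
             'data4' : 0.88, 'data5' : 0.69}
    rocs2 = {'data1' : 0.88, 'data2' : 0.80, 'data3' : 0.79,
             'data4' : 0.83, 'data5' : 0.66}
    scatter(rocs1, rocs2, statistic = 'roc',
            x1Label = 'classifier 1 ROC', x2Label = 'classifier 2 ROC',
            fileName = 'comparison.png', title = 'classifier comparison')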