Пример #1
0
def scatter(r1, r2, statistic="roc", x1Label="", x2Label="", fileName=None, **args):
    """
    Draw a scatter plot comparing the performance of two classifiers.

    Each point pairs a score taken from `r1` (x axis) with the matching
    score taken from `r2` (y axis).  The line y = x over [0, 1) is drawn
    in black as a reference.

    :Parameters:
      - `r1, r2` - both are either a list of Result classes, or a list of
        success rates / ROC scores.  Dictionaries are accepted as well; when
        two dictionaries differ in length only their common keys are used.
      - `statistic` - which measure of classifier success to plot
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the roc curve you can use something like:
        'roc50' or 'roc0.1'
      - `x1Label, x2Label` - labels of the x and y axes
      - `fileName` - if not None, the figure is saved under this name and
        then closed

    :Keywords:
      - `title` - the title of the plot

    :Raises:
      - `ValueError` - if the lengths differ and the inputs are not both
        dictionaries, if two dictionaries of equal length have different
        keys, or if there is nothing to plot
    """

    both_dicts = isinstance(r1, dict) and isinstance(r2, dict)

    if len(r1) != len(r2):
        print("unequal lengths for r1 and r2")
        if not both_dicts:
            raise ValueError("Cannot handle unequal length when it is not a dict")
        # restrict both dictionaries to the keys they share
        common = misc.intersect(list(r1.keys()), list(r2.keys()))
        r1 = dict((key, r1[key]) for key in common)
        r2 = dict((key, r2[key]) for key in common)
    elif both_dicts and set(r1.keys()) != set(r2.keys()):
        # same size but different keys: the points could not be paired
        raise ValueError("r1 and r2 do not have the same keys")

    if len(r1) == 0:
        raise ValueError("r1 and r2 contain no results to compare")

    # peek at one element to decide whether the statistic must be extracted
    if both_dicts:
        first = r1[list(r1.keys())[0]]
    else:
        first = r1[0]

    if first.__class__.__name__ in ("Results", "Container"):
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else:
        p1 = r1
        p2 = r2

    if isinstance(p1, dict):
        # Pair the values by key.  Two separate .values() calls only line up
        # if both dictionaries happen to iterate in the same order.
        # NOTE(review): assumes misc.extractAttribute keeps the keys of a
        # dict input - confirm against its implementation.
        order = list(p1.keys())
        p2 = [p2[key] for key in order]
        p1 = [p1[key] for key in order]

    from matplotlib import pylab

    diagonal = numpy.arange(0, 1, 0.01)
    pylab.plot(p1, p2, "bo", diagonal, diagonal, "-k")
    pylab.xlabel(x1Label, fontsize=18)
    pylab.ylabel(x2Label, fontsize=18)
    if "title" in args:
        pylab.title(args["title"], fontsize=18)

    # Save before show(): with a blocking backend the figure may already be
    # destroyed when show() returns, and saving afterwards writes a blank image.
    if fileName is not None:
        pylab.savefig(fileName)
    pylab.show()
    if fileName is not None:
        pylab.close()
Пример #2
0
def extractNumFeatures(resultsFileName) :
    """
    Average the feature counts recorded in a saved results file.

    The object stored in `resultsFileName` is read with `myio.load`.  When it
    is a dictionary, the 'foldInfo' attribute of its entries is collected
    with `misc.extractAttribute`; every line of that text starting with
    'number of features' contributes the number following the first ':'.

    :Parameters:
      - `resultsFileName` - name of the file to load the results from

    :Returns:
      a dictionary mapping each key to the `numpy.average` of the feature
      counts found for it; an empty dictionary if the loaded object is not
      a dictionary
    """

    loaded = myio.load(resultsFileName)
    if type(loaded) != dict :
        return {}

    foldInfo = misc.extractAttribute(loaded, 'foldInfo')
    averages = {}
    for name in foldInfo :
        # one float per 'number of features: <value>' line, over all entries
        counts = [float(row.split(':')[1])
                  for report in foldInfo[name]
                  for row in report.split('\n')
                  if row.startswith('number of features')]
        averages[name] = numpy.average(counts)
    return averages
Пример #3
0
def extractNumFeatures(resultsFileName):
    """
    Compute the average number of features reported in a results file.

    Loads the object saved in `resultsFileName` via `myio.load`.  For a
    dictionary, the 'foldInfo' attribute is gathered per key through
    `misc.extractAttribute`, and each text line that begins with
    'number of features' is parsed for the value after its first ':'.

    :Parameters:
      - `resultsFileName` - name of the file holding the saved results

    :Returns:
      a dictionary from key to the `numpy.average` of the parsed values;
      empty when the loaded object is not a dictionary
    """
    marker = 'number of features'

    def parse_counts(reports):
        # collect the value of every marker line across all reports
        found = []
        for report in reports:
            found.extend(float(entry.split(':')[1])
                         for entry in report.split('\n')
                         if entry.startswith(marker))
        return found

    results = myio.load(resultsFileName)
    summary = {}
    if type(results) == dict:
        perKey = misc.extractAttribute(results, 'foldInfo')
        for key in perKey:
            summary[key] = numpy.average(parse_counts(perKey[key]))
    return summary
Пример #4
0
def significance(r1, r2, statistic = 'roc') :
    """
    report the statistical significance of the difference in error rates
    of a series of classification results of two classifiers
    using the Wilcoxon signed rank test.

    Returns: pvalue, (median1, median2)
    where:
    pvalue - the pvalue of the two sided Wilcoxon signed rank test; to get
    the pvalue of a one sided test divide the pvalue by two.
    (median1, median2) - the median of the statistics of the inputs r1 and r2.

    :Parameters:
      - `r1, r2` - both are either a list of Result classes, or a list of success
        rates.  Dictionaries are accepted as well; when two dictionaries
        differ in length only their common keys are compared.
      - `statistic` - which measure of classifier success to compare
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the roc curve you can use something like:
        'roc50' or 'roc0.1'

    :Raises:
      - `ValueError` - if r1 and r2 differ in type, differ in length without
        being dictionaries, are dictionaries with different keys, or are empty
    """

    if type(r1) != type(r2) :
        raise ValueError('r1 and r2 do not have the same type')

    # if the two objects are dictionaries, then we can handle the case that
    # the lengths are not equal:
    if len(r1) != len(r2) :
        print('unequal lengths for r1 and r2')
        if not isinstance(r1, dict) :
            raise ValueError('Cannot handle unequal length when it is not a dict')
        common = misc.intersect(list(r1.keys()), list(r2.keys()))
        r1 = dict((key, r1[key]) for key in common)
        r2 = dict((key, r2[key]) for key in common)

    if len(r1) == 0 :
        raise ValueError('r1 and r2 contain no results to compare')

    if isinstance(r1, dict) :
        # compare as sets: the order in which two dictionaries list their
        # keys is not a reliable way to tell whether the keys agree
        if set(r1.keys()) != set(r2.keys()) :
            raise ValueError('r1 and r2 do not have the same keys')
        first = r1[list(r1.keys())[0]]
    else :
        first = r1[0]

    # peek at one element to decide whether the statistic must be extracted
    if first.__class__.__name__ in ('Results', 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2

    if isinstance(p1, dict) :
        # The test is paired, so the i-th value of p1 must belong to the same
        # key as the i-th value of p2; look both up through one key order.
        # NOTE(review): assumes misc.extractAttribute keeps the keys of a
        # dict input - confirm against its implementation.
        order = list(p1.keys())
        p2 = [p2[key] for key in order]
        p1 = [p1[key] for key in order]

    import salstat_stats
    test = salstat_stats.TwoSampleTests(p1, p2)
    test.SignedRanks(p1, p2)

    p = test.prob
    median1 = numpy.median(numpy.array(p1))
    median2 = numpy.median(numpy.array(p2))

    return p, (median1, median2)
Пример #5
0
def scatter(r1, r2, statistic = 'roc', x1Label = '', x2Label= '',
            fileName = None, **args) :
    """
    Draw a scatter plot comparing the performance of two classifiers.

    Each point pairs a score taken from `r1` (x axis) with the matching
    score taken from `r2` (y axis).  The line y = x over [0, 1) is drawn
    in black as a reference.

    :Parameters:
      - `r1, r2` - both are either a list of Result classes, or a list of
        success rates / ROC scores.  Dictionaries are accepted as well; when
        two dictionaries differ in length only their common keys are used.
      - `statistic` - which measure of classifier success to plot
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the roc curve you can use something like:
        'roc50' or 'roc0.1'
      - `x1Label, x2Label` - labels of the x and y axes
      - `fileName` - if not None, the figure is saved under this name and
        then closed

    :Keywords:
      - `title` - the title of the plot

    :Raises:
      - `ValueError` - if the lengths differ and the inputs are not both
        dictionaries, if two dictionaries of equal length have different
        keys, or if there is nothing to plot
    """

    both_dicts = isinstance(r1, dict) and isinstance(r2, dict)

    if len(r1) != len(r2) :
        print('unequal lengths for r1 and r2')
        if not both_dicts :
            raise ValueError('Cannot handle unequal length when it is not a dict')
        # restrict both dictionaries to the keys they share
        common = misc.intersect(list(r1.keys()), list(r2.keys()))
        r1 = dict((key, r1[key]) for key in common)
        r2 = dict((key, r2[key]) for key in common)
    elif both_dicts and set(r1.keys()) != set(r2.keys()) :
        # same size but different keys: the points could not be paired
        raise ValueError('r1 and r2 do not have the same keys')

    if len(r1) == 0 :
        raise ValueError('r1 and r2 contain no results to compare')

    # peek at one element to decide whether the statistic must be extracted
    if both_dicts :
        first = r1[list(r1.keys())[0]]
    else :
        first = r1[0]

    if first.__class__.__name__ in ('Results', 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2

    if isinstance(p1, dict) :
        # Pair the values by key.  Two separate .values() calls only line up
        # if both dictionaries happen to iterate in the same order.
        # NOTE(review): assumes misc.extractAttribute keeps the keys of a
        # dict input - confirm against its implementation.
        order = list(p1.keys())
        p2 = [p2[key] for key in order]
        p1 = [p1[key] for key in order]

    from matplotlib import pylab

    diagonal = numpy.arange(0, 1, 0.01)
    pylab.plot(p1, p2, 'bo', diagonal, diagonal, '-k')
    pylab.xlabel(x1Label, fontsize = 18)
    pylab.ylabel(x2Label, fontsize = 18)
    if 'title' in args :
        pylab.title(args['title'], fontsize = 18)

    # Save before show(): with a blocking backend the figure may already be
    # destroyed when show() returns, and saving afterwards writes a blank image.
    if fileName is not None :
        pylab.savefig(fileName)
    pylab.show()
    if fileName is not None :
        pylab.close()
def significance(r1, r2, statistic = 'roc') :
    """
    report the statistical significance of the difference in error rates
    of a series of classification results of two classifiers
    using the Wilcoxon signed rank test.

    Returns: pvalue, (median1, median2)
    where:
    pvalue - the pvalue of the two sided Wilcoxon signed rank test; to get
    the pvalue of a one sided test divide the pvalue by two.
    (median1, median2) - the median of the statistics of the inputs r1 and r2.

    :Parameters:
      - `r1, r2` - both are either a list of Result classes, or a list of success
        rates.  Dictionaries are accepted as well; when two dictionaries
        differ in length only their common keys are compared.
      - `statistic` - which measure of classifier success to compare
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the roc curve you can use something like:
        'roc50' or 'roc0.1'

    :Raises:
      - `ValueError` - if r1 and r2 differ in type, differ in length without
        being dictionaries, are dictionaries with different keys, or are empty
    """

    if type(r1) != type(r2) :
        raise ValueError('r1 and r2 do not have the same type')

    # if the two objects are dictionaries, then we can handle the case that
    # the lengths are not equal:
    if len(r1) != len(r2) :
        print('unequal lengths for r1 and r2')
        if not isinstance(r1, dict) :
            raise ValueError('Cannot handle unequal length when it is not a dict')
        common = misc.intersect(list(r1.keys()), list(r2.keys()))
        r1 = dict((key, r1[key]) for key in common)
        r2 = dict((key, r2[key]) for key in common)

    if len(r1) == 0 :
        raise ValueError('r1 and r2 contain no results to compare')

    if isinstance(r1, dict) :
        # compare as sets: the order in which two dictionaries list their
        # keys is not a reliable way to tell whether the keys agree
        if set(r1.keys()) != set(r2.keys()) :
            raise ValueError('r1 and r2 do not have the same keys')
        first = r1[list(r1.keys())[0]]
    else :
        first = r1[0]

    # peek at one element to decide whether the statistic must be extracted
    if first.__class__.__name__ in ('Results', 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2

    if isinstance(p1, dict) :
        # The test is paired, so the i-th value of p1 must belong to the same
        # key as the i-th value of p2; look both up through one key order.
        # NOTE(review): assumes misc.extractAttribute keeps the keys of a
        # dict input - confirm against its implementation.
        order = list(p1.keys())
        p2 = [p2[key] for key in order]
        p1 = [p1[key] for key in order]

    import salstat_stats
    test = salstat_stats.TwoSampleTests(p1, p2)
    test.SignedRanks(p1, p2)

    p = test.prob
    median1 = numpy.median(numpy.array(p1))
    median2 = numpy.median(numpy.array(p2))

    return p, (median1, median2)