예제 #1
0
# ** Statistical significance ** is a measure of the probability that,
# for whatever reason, we stumbled upon the results we did by chance.
#
# There are several tests for statistical significance, each applying
# to a different question.  Our question is: "Is the difference
# between the average height of people in town 1 and town 2
# statistically significant?"  We ask a similar question about the
# difference in average campaign contributions.  The test that
# answers this question is the
# [T-Test](https://en.wikipedia.org/wiki/Student's_t-test).  There are
# several flavors of T-Test and we will discuss these soon, but for
# now we'll focus on Welch's T-Test.

import welchttest

print "Welch's T-Test p-value:", welchttest.ttest(town1_heights, town2_heights)

# The Welch's T-Test emitted a p-value of ** .349 **.  A p-value is the
# probability of seeing an effect size at least as large as the .479
# feet between town 1 and town 2 purely by chance, if the towns' true
# average heights were actually the same.  In this case, there's a 34.9%
# chance that we would arrive at an effect size like ours by chance.
#
# What's a good cutoff for p-values to know whether we should trust
# the effect size we're seeing?  Two popular values are .05 or .01: if
# there is less than a 5% or 1% chance that we arrived at our answer
# by chance, we're willing to say that we have a ** statistically
# significant ** result.
#
# So in our case, our result is not significant.  Had we taken more
# measurements, or if the differences in heights were farther apart,
# we might have reached significance.  But, given our current results,
예제 #2
0
파일: exercise4.py 프로젝트: sronen/dataiap
import csv,sys
import welchttest

reader = csv.DictReader( open(sys.argv[1], 'r') )

obama_don = []
mccain_don = []

for row in reader:
	name = row['cand_nm']
	amount = float(row['contb_receipt_amt']) # need a number, not string! Otherwise boxplot wouldn't run!
	
	if 'Obama' in name:
		obama_don.append(float(amount))
	elif 'McCain' in name:
		mccain_don.append(float(amount))

print "Welch's T-Test p-value:", welchttest.ttest(obama_don, mccain_don)
# ** Statistical significance ** is a measure of the probability that,
# for whatever reason, we stumbled upon the results we did by chance.
#
# There are several tests for statistical significance, each applying
# to a different question.  Our question is: "Is the difference
# between the average height of people in town 1 and town 2
# statistically significant?"  We ask a similar question about the
# difference in average campaign contributions.  The test that
# answers this question is the
# [T-Test](https://en.wikipedia.org/wiki/Student's_t-test).  There are
# several flavors of T-Test and we will discuss these soon, but for
# now we'll focus on Welch's T-Test.

import welchttest

print "Welch's T-Test p-value:", welchttest.ttest(town1_heights, town2_heights)

# The Welch's T-Test emitted a p-value of ** .349 **.  A p-value is the
# probability of seeing an effect size at least as large as the .479
# feet between town 1 and town 2 purely by chance, if the towns' true
# average heights were actually the same.  In this case, there's a 34.9%
# chance that we would arrive at an effect size like ours by chance.
#
# What's a good cutoff for p-values to know whether we should trust
# the effect size we're seeing?  Two popular values are .05 or .01: if
# there is less than a 5% or 1% chance that we arrived at our answer
# by chance, we're willing to say that we have a ** statistically
# significant ** result.
#
# So in our case, our result is not significant.  Had we taken more
# measurements, or if the differences in heights were farther apart,
# we might have reached significance.  But, given our current results,
예제 #4
0
# Bucket boundaries spanning the observed donation range.
buckets = range(int(min_amt), int(max_amt), bucket_size)

# bar plot: both candidates' donation histograms on one set of axes
fig = plt.figure(figsize=(30,10))
sub = fig.add_subplot(111)
width = 50
sub.bar(obamadonations_hist.keys(), obamadonations_hist.values(),
        color='b', width=width, label='Obama Donations')
# Shift McCain's bars right by one bar width so they are drawn beside
# Obama's bars instead of at the same x positions.
sub.bar([amt + width for amt in mccaindonations_hist.keys()],
        mccaindonations_hist.values(),
        color='r', width=width, label='McCain Donations')
# matplotlib names legend corners 'upper'/'lower', not 'top'/'bottom';
# 'top left' is not a recognised location string.
sub.legend(loc='upper left', ncol=1)
sub.set_xlim((-20000, 20000))
# NOTE(review): the output filename spelling ("histgrams") is kept as-is
# because other material may reference the file by this name.
plt.savefig('day3/donations_histgrams.png', format='png')


# box plot: one box per candidate, drawn side by side on shared axes
donation_groups = [obamadonations, mccaindonations]
candidate_labels = ("Obama", "McCain")

fig = plt.figure(figsize=(10, 6))
sub = fig.add_subplot(111)
sub.boxplot(donation_groups, whis=1)
# Label the boxes after drawing them, in the same order as the data.
sub.set_xticklabels(candidate_labels)
sub.set_title('Obama Vs. McCain Donation')
# Fix the visible y-range to -250..1250.
sub.set_ylim((-250, 1250))
plt.savefig('day3/donations_boxplot.png', format='png')


# significance test with Welch's T-Test
import sys
sys.path.append('day3/')
import welchttest
print "Welch's T-Test p-value:", welchttest.ttest(obamadonations, mccaindonations)
예제 #5
0
amount2 = 0

for row in reader:
    name = row['cand_nm']
    datestr = row['contb_receipt_dt']
    amount = float(row['contb_receipt_amt'])
    date = datetime.datetime.strptime(datestr, '%d-%b-%y')
    receipt_desc = row['receipt_desc']
    if True:
        if 'Obama' in name:
            obamadonations.append(amount)
        if 'McCain' in name:
            mccaindonations.append(amount)

plt.ylim((-250, 1250))
plt.boxplot([obamadonations, mccaindonations], whis=1)

import welchttest
print("Welch's T-Test p-value:",
      welchttest.ttest(obamadonations, mccaindonations))[1]

import scipy.stats

print "Mann-Whitney U p-value", scipy.stats.mannwhitneyu(
    obamadonations, mccaindonations)[1]
#def sorted_by_date(donation_by_date):
#    return(sorted(donation_by_date.items(),key=lambda(key,val):key))

#sorted_by_date_obama=sorted_by_date(obamadonations)
#sorted_by_date_mccain=sorted_by_date(mccaindonations)