# significance ** is a measure of the probability that, for whatever # reason, we stumbled upon the results we did by chance. # # There are several tests for statistical significance, each applying # to a different question. Our question is: "Is the difference # between the average height of people in town 1 and town 2 # statistically significant?" We ask a similar question about the # difference in average campaign contributions. The test that # answers this question is the # [T-Test](https://en.wikipedia.org/wiki/Student's_t-test). There are # several flavors of T-Test and we will discuss these soon, but for # now we'll focus on Welch's T-Test. import welchttest print "Welch's T-Test p-value:", welchttest.ttest(town1_heights, town2_heights) # The Welch's T-Test emitted a p-value of ** .349 **. A p-value is the # probability that the effect size of .479 feet between town 1 and town # 2 happened by chance. In this case, there's a 34.9% chance that we've # arrived at our effect size by chance. # # What's a good cutoff for p-values to know whether we should trust # the effect size we're seeing? Two popular values are .05 and .01: if # there is less than a 5% or 1% chance that we arrived at our answer # by chance, we're willing to say that we have a ** statistically # significant ** result. # # So in our case, our result is not significant. Had we taken more # measurements, or if the difference in heights were larger, # we might have reached significance. But, given our current results,
# Compare Obama and McCain contribution amounts with Welch's T-Test.
#
# Usage: pass the path of the contributions CSV as the first command-line
# argument. Each row must provide the `cand_nm` (candidate name) and
# `contb_receipt_amt` (contribution amount) columns.
import csv
import sys

import welchttest

obama_don = []
mccain_don = []

# Read inside a `with` block so the file handle is closed even if a row
# fails to parse.
with open(sys.argv[1], 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        name = row['cand_nm']
        # The CSV yields strings; we need a number, not a string.
        # Convert once here instead of again at every append.
        amount = float(row['contb_receipt_amt'])
        if 'Obama' in name:
            obama_don.append(amount)
        elif 'McCain' in name:
            mccain_don.append(amount)

# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Welch's T-Test p-value: %s" % (welchttest.ttest(obama_don, mccain_don),))
buckets = range(int(min_amt), int(max_amt), bucket_size)

# bar plot: one bar series per candidate, keyed by the histogram dict keys
fig = plt.figure(figsize=(30, 10))
sub = fig.add_subplot(111)
width = 50
# Materialize the dict views into plain lists before handing them to
# matplotlib (under Python 3, keys()/values() are views, not lists).
sub.bar(list(obamadonations_hist.keys()),
        list(obamadonations_hist.values()),
        color='b', width=width, label='Obama Donations')
# Shift the McCain bars right by one bar width so the two series sit side
# by side instead of being drawn on top of each other.
sub.bar([amt + width for amt in mccaindonations_hist.keys()],
        list(mccaindonations_hist.values()),
        color='r', width=width, label='McCain Donations')
# 'top left' is not a legend location matplotlib recognizes; the valid
# spelling for that corner is 'upper left'.
sub.legend(loc='upper left', ncol=1)
sub.set_xlim((-20000, 20000))
plt.savefig('day3/donations_histgrams.png', format='png')

# box plot: both candidates on one axis, y-range clipped to (-250, 1250)
fig = plt.figure(figsize=(10, 6))
sub = fig.add_subplot(111)
sub.boxplot([obamadonations, mccaindonations], whis=1)
sub.set_xticklabels(("Obama", "McCain"))
sub.set_ylim((-250, 1250))
sub.set_title('Obama Vs. McCain Donation')
plt.savefig('day3/donations_boxplot.png', format='png')

# significance test with Welch's T-Test
# welchttest is loaded from day3/, so that directory must be on sys.path
# before the import runs.
import sys
sys.path.append('day3/')
import welchttest

# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Welch's T-Test p-value: %s"
      % (welchttest.ttest(obamadonations, mccaindonations),))
# NOTE(review): amount2 is not read anywhere in the visible code; kept in
# case later code depends on it.
amount2 = 0

# Split contribution amounts by candidate. The two checks are independent
# `if`s, so a name containing both strings is added to both lists.
for row in reader:
    name = row['cand_nm']
    datestr = row['contb_receipt_dt']
    amount = float(row['contb_receipt_amt'])
    # NOTE(review): date and receipt_desc are computed but not used in the
    # visible code; kept because strptime/row lookups still fail loudly on
    # malformed rows and later code may read these names.
    date = datetime.datetime.strptime(datestr, '%d-%b-%y')
    receipt_desc = row['receipt_desc']
    if 'Obama' in name:
        obamadonations.append(amount)
    if 'McCain' in name:
        mccaindonations.append(amount)

# Box plot of both candidates, y-range clipped to (-250, 1250).
plt.ylim((-250, 1250))
plt.boxplot([obamadonations, mccaindonations], whis=1)

import welchttest
# The original subscripted the print call itself (`print(...)[1]`): under
# Python 2 that silently dropped the label, and under Python 3 it raises
# TypeError on `None[1]`. Print the label and the test result together.
print("Welch's T-Test p-value: %s"
      % (welchttest.ttest(obamadonations, mccaindonations),))

import scipy.stats
# Element [1] of the mannwhitneyu result is the p-value.
print("Mann-Whitney U p-value %s"
      % (scipy.stats.mannwhitneyu(obamadonations, mccaindonations)[1],))

#def sorted_by_date(donation_by_date):
#    return(sorted(donation_by_date.items(),key=lambda(key,val):key))
#sorted_by_date_obama=sorted_by_date(obamadonations)
#sorted_by_date_mccain=sorted_by_date(mccaindonations)