def MakeHists(live):
    """Plots and saves histograms of four variables for live births.

    One figure is saved per variable, in this order: birthwgt_lb,
    birthwgt_oz, agepreg (floored with np.floor) and prglngth.

    live: DataFrame
    """
    def save_hist(values, label, root, **options):
        """Plots a labelled Hist of values and saves it under root."""
        thinkplot.Hist(thinkstats2.Hist(values, label=label))
        thinkplot.Save(root=root, **options)

    save_hist(live.birthwgt_lb, 'birthwgt_lb', 'first_wgt_lb_hist',
              xlabel='pounds',
              ylabel='frequency',
              axis=[-1, 14, 0, 3200])

    save_hist(live.birthwgt_oz, 'birthwgt_oz', 'first_wgt_oz_hist',
              xlabel='ounces',
              ylabel='frequency',
              axis=[-1, 16, 0, 1200])

    # No explicit axis limits for this figure.
    save_hist(np.floor(live.agepreg), 'agepreg', 'first_agepreg_hist',
              xlabel='years',
              ylabel='frequency')

    save_hist(live.prglngth, 'prglngth', 'first_prglngth_hist',
              xlabel='weeks',
              ylabel='frequency',
              axis=[-1, 53, 0, 5000])
def printFirstVsOthersHistograms():
    """Shows paired histograms of prglngth for firsts and others.

    Takes no arguments: `firsts` and `others` are free names resolved
    outside this function (presumably module-level frames -- confirm).
    """
    hist_firsts = thinkstats2.Hist(firsts.prglngth)
    hist_others = thinkstats2.Hist(others.prglngth)
    bar_width = 0.45

    thinkplot.PrePlot(2)
    # Bars are aligned to opposite sides so the two series sit next to
    # each other at every x value.
    bars = (
        (hist_firsts, 'right', 'first'),
        (hist_others, 'left', 'others'),
    )
    for hist, side, name in bars:
        thinkplot.Hist(hist, align=side, width=bar_width, label=name)

    thinkplot.Show(xlabel='weeks', ylabel='frequency', xlim=[27, 46])
def testHist(self):
    """Checks Hist construction from a string and from a Counter.

    Verifies the number of distinct values and the frequency of 'l' for
    both construction paths, and that Hists built from 'allen' and
    'nella' compare equal.
    """
    # assertEqual is used throughout: the assertEquals alias is deprecated
    # and no longer exists in recent unittest versions.
    hist = thinkstats2.Hist('allen')
    self.assertEqual(len(hist), 4)
    self.assertEqual(hist.Freq('l'), 2)

    hist = thinkstats2.Hist(Counter('allen'))
    self.assertEqual(len(hist), 4)
    self.assertEqual(hist.Freq('l'), 2)

    hist2 = thinkstats2.Hist('nella')
    self.assertEqual(hist, hist2)
def firstBabies(first, others):
    """Prints the mean prglngth of two groups and their percent difference.

    first: object whose prglngth attribute provides mean()
    others: object whose prglngth attribute provides mean()

    Prints three lines; returns None.
    """
    first_mean = first.prglngth.mean()
    other_mean = others.prglngth.mean()
    print("Mean of Pregnancy length of first born :", first_mean)
    print("Mean of Pregnancy length of other than first born :", other_mean)

    # Absolute difference expressed as a percentage of the others' mean.
    pct_mean = abs(first_mean - other_mean) * 100 / other_mean
    print("% change between first and other born : ", pct_mean)
def print_num_albums_per_artist(all_genres):
    """Prints album-count statistics per artist and shows a histogram.

    Reports the number of artists, the total and average number of
    albums, and how many artists have more than 6 albums, then plots how
    many artists have each album count.

    all_genres: mapping from artist to a sized collection of albums
    """
    num_albums_list = [len(albums) for _, albums in all_genres.items()]

    # album count -> number of artists with that many albums
    num_albums_counts = {}
    for count in num_albums_list:
        num_albums_counts[count] = num_albums_counts.get(count, 0) + 1

    num_artists = len(all_genres)
    num_albums = sum(num_albums_list)
    print("In total,", num_artists, "artists, producing", num_albums,
          "albums.")
    print("An average of", "%.2f" % (num_albums / num_artists),
          "albums per artist.")

    num_albums_hist = ts2.Hist(num_albums_counts)
    artists_more_than_6_albums = sum(
        freq for size, freq in num_albums_hist.Items() if size > 6)
    print(artists_more_than_6_albums, 'artists with more than 6 albums.')

    tp.Hist(num_albums_hist)
    tp.Show(xlabel='Number of albums',
            ylabel='Count of artists with this number of albums',
            title='Histogram of the number of albums per artist')
def ex3():
    """Simulates games and plots the results in a 2x2 grid.

    Panel 1: histogram of 1000 SimulateGame(lam=4) results.
    Panel 2: their CDF, with vertical lines at the 5th and 95th
             percentiles.
    Panel 3: RMSE versus lam, for lam in 1..14 (1000 games each).
    Panel 4: RMSE versus number of games m, for m = 10, 20, ..., 990.

    Also prints the RMSE of the first batch. SimulateGame and RMSE are
    defined elsewhere.
    """
    def VertLine(x, y=1):
        """Draws a light-grey vertical line at x from 0 up to y."""
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    lam = 4
    goal_totals = []
    for _ in range(1000):
        goal_totals.append(SimulateGame(lam=lam))
    print('RMSE', RMSE(goal_totals, lam))

    totals_hist = thinkstats2.Hist(goal_totals)
    totals_cdf = thinkstats2.Cdf(goal_totals)

    thinkplot.PrePlot(rows=2, cols=2)

    thinkplot.SubPlot(1)
    thinkplot.Hist(totals_hist)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(totals_cdf)
    for percentile in (5, 95):
        VertLine(totals_cdf.Percentile(percentile))

    # lambda vs. rmse
    # (original note: rmse goes up as lambda goes up)
    thinkplot.SubPlot(3)
    lams = range(1, 15)
    rmses = []
    for rate in lams:
        games = [SimulateGame(lam=rate) for _ in range(1000)]
        rmses.append(RMSE(games, rate))
    thinkplot.Plot(lams, rmses)

    # m vs. rmse
    # (original note: rmse may go down very slowly as m goes up, but that
    # is not at all clear)
    thinkplot.SubPlot(4)
    ms = np.arange(10, 1000, 10)
    rmses = []
    for m in ms:
        # NOTE(review): SimulateGame() is called with its default lam
        # while the error is measured against 4 -- confirm they agree.
        games = [SimulateGame() for _ in range(m)]
        rmses.append(RMSE(games, 4))
    thinkplot.Plot(ms, rmses)

    thinkplot.show()
def EstimateHazardFunction(complete, ongoing, label='', shift=1e-7):
    """Estimates the hazard function by Kaplan-Meier.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    complete: list of complete lifetimes
    ongoing: list of ongoing lifetimes
    label: string
    shift: presumed additional survival of ongoing

    returns: HazardFunction built from a map of t to hazard
    """
    # counts and survival function of the complete lifetimes
    num_complete = len(complete)
    ended_counts = thinkstats2.Hist(complete)
    surv_complete = SurvivalFunction(thinkstats2.Cdf(complete))

    # survival function of the ongoing lifetimes
    # The shift is a hack to deal with simultaneity: if one case is
    # complete at some t and another is ongoing at t, the ongoing case
    # is presumed to exceed t+shift.
    num_ongoing = len(ongoing)
    shifted_cdf = thinkstats2.Cdf(ongoing).Shift(shift)
    surv_ongoing = SurvivalFunction(shifted_cdf)

    def hazard_at(t, ended):
        """Returns the fraction of cases at risk at t that ended at t."""
        complete_left = num_complete * surv_complete[t]
        ongoing_left = num_ongoing * surv_ongoing[t]
        return ended / (ended + complete_left + ongoing_left)

    lams = {t: hazard_at(t, ended)
            for t, ended in sorted(ended_counts.Items())}

    return HazardFunction(lams, label=label)
def RunModel(self):
    """Simulates a sample of equally likely 'H'/'T' draws.

    The sample size is the sum of the two counts stored in self.data.

    returns: tuple of (number of 'H', number of 'T') in the sample
    """
    # Unpack into locals. The previous version unpacked into attributes
    # of a global `df`, which overwrote shared state and required `df`
    # to exist just to hold two temporary numbers.
    heads, tails = self.data
    n = heads + tails
    sample = [random.choice('HT') for _ in range(n)]
    hist = thinkstats2.Hist(sample)
    data = hist['H'], hist['T']
    return data
def RunModel(self):
    """Simulates a sample of equally likely 'H'/'T' draws.

    The sample size is the sum of the two counts stored in self.data.

    returns: tuple of (number of 'H', number of 'T') in the sample
    """
    num_heads, num_tails = self.data
    flips = []
    for _ in range(num_heads + num_tails):
        flips.append(random.choice('HT'))

    counts = thinkstats2.Hist(flips)
    return counts['H'], counts['T']
def RunModel(self):
    """Simulates die rolls with all six faces equally likely.

    The number of rolls is the sum of self.data.

    returns: frequencies of the faces 1..6, in that order
    """
    num_rolls = sum(self.data)
    faces = [1, 2, 3, 4, 5, 6]
    simulated = np.random.choice(faces, num_rolls, replace=True)
    return thinkstats2.Hist(simulated).Freqs(faces)
def main(script):
    """Tests the functions in this module.

    Also prints the mean total weight of firsts and others, their
    difference, and the Cohen effect size.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Mode: expected to be 39
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39, most_common

    # AllModes: the first entry is expected to have frequency 4693
    ranked = AllModes(length_hist)
    top_freq = ranked[0][1]
    assert top_freq == 4693, top_freq

    for value, freq in ranked[:5]:
        print(value, freq)

    mean_firsts = firsts.totalwgt_lb.mean()
    mean_others = others.totalwgt_lb.mean()
    print("firsts = {} pounds, others = {} pounds, dif = {} pounds ".format(
        mean_firsts, mean_others, mean_firsts - mean_others))

    effect = thinkstats2.CohenEffectSize(firsts.totalwgt_lb,
                                         others.totalwgt_lb)
    print(effect)

    print('%s: All tests passed.' % script)
def main(script):
    """Tests the functions in this module.

    Also prints the effect sizes returned by WeightDifferences and
    PregnancyLengthDifferences.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Mode: expected to be 39
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39

    # AllModes: the first entry is expected to have frequency 4693
    ranked = AllModes(length_hist)
    assert ranked[0][1] == 4693

    for value, freq in ranked[:5]:
        print(value, freq)

    weight_effect = WeightDifferences(firsts, others, live)
    print("Cohens'd Effect of weight differences:", weight_effect)

    length_effect = PregnancyLengthDifferences(firsts, others)
    print("Cohens'd Effect of pregnancy length differences:", length_effect)

    print('%s: All tests passed.' % script)
def main(script):
    """Tests the functions in this module.

    Runs the weight and pregnancy-length comparisons first, then checks
    Mode and AllModes against known values.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # comparisons between firsts and others
    weight_comparison(live, firsts, others)
    preg_length_comparison(live, firsts, others)

    # Mode: expected to be 39
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39, most_common

    # AllModes: the first entry is expected to have frequency 4693
    ranked = AllModes(length_hist)
    top_freq = ranked[0][1]
    assert top_freq == 4693, top_freq

    for value, freq in ranked[:5]:
        print(value, freq)

    print('%s: All tests passed.' % script)
def MakeHists(greq, less):
    """Plots and saves a prglngth histogram for each of two groups.

    greq: DataFrame, saved as greq_prglngth_hist (y axis up to 1000)
    less: DataFrame, saved as less_prglngth_hist (y axis up to 5000)
    """
    groups = (
        (greq, 'greq_prglngth_hist', 1000),
        (less, 'less_prglngth_hist', 5000),
    )
    for frame, root, ymax in groups:
        hist = thinkstats2.Hist(frame.prglngth, label='prglngth')
        thinkplot.Hist(hist)
        thinkplot.Save(root=root,
                       xlabel='weeks',
                       ylabel='frequency',
                       axis=[-1, 53, 0, ymax])
def PairwiseDiffInPrglngthOfSameResp(preg_map, preg):
    """Plots a histogram of prglngth differences within respondents.

    For every entry of preg_map with at least two indices, takes the
    prglngth of the first two and counts their difference (second minus
    first, as computed by np.diff).

    preg_map: mapping from caseid to a sequence of row labels in preg
    preg: DataFrame
    """
    diff_counts = thinkstats2.Hist()

    for _, indices in preg_map.items():
        if len(indices) < 2:
            continue
        first_two = preg.loc[indices[0:2]].prglngth
        delta = np.diff(first_two)[0]
        diff_counts[delta] += 1

    thinkplot.Hist(diff_counts)
def MakeComparison(firsts, others):
    """Plots histograms of pregnancy length for first babies and others.

    Both histograms share one figure, saved as first_nsfg_hist.

    firsts: DataFrame
    others: DataFrame
    """
    hists = [
        thinkstats2.Hist(firsts.prglngth, label='first'),
        thinkstats2.Hist(others.prglngth, label='other'),
    ]
    bar_width = 0.45

    thinkplot.PrePlot(2)
    # Opposite alignments put the two bars next to each other.
    for hist, side in zip(hists, ('right', 'left')):
        thinkplot.Hist(hist, align=side, width=bar_width)

    thinkplot.Save(root='first_nsfg_hist',
                   title='Histogram',
                   xlabel='weeks',
                   ylabel='frequency',
                   axis=[27, 46, 0, 2700])
def ProbilityMassFunction(group):
    """Returns the probability mass function of the values in group.

    group: collection of values, passed straight to thinkstats2.Pmf

    returns: thinkstats2.Pmf
    """
    # The earlier version also built a {value: freq / total} dict by hand
    # from a Hist of the same data, but never used it: only this Pmf was
    # returned. The dead computation is removed so group is read once.
    return thinkstats2.Pmf(group)
def RunModel(self):
    """Run the model of the null hypothesis.

    Draws heads + tails equally likely 'H'/'T' values, where heads and
    tails are the two counts in self.data.

    returns: simulated data, as (number of 'H', number of 'T')
    """
    observed_heads, observed_tails = self.data
    total = observed_heads + observed_tails

    flips = []
    for _ in range(total):
        flips.append(random.choice('HT'))

    tally = thinkstats2.Hist(flips)
    return tally['H'], tally['T']
def RunModel(self):
    """Run the model of the null hypothesis.

    Rolls sum(self.data) dice with all six faces equally likely.

    returns: simulated data, as frequencies of the faces 1..6 in order
    """
    total_rolls = sum(self.data)
    faces = [1, 2, 3, 4, 5, 6]
    outcomes = np.random.choice(faces, total_rolls, replace=True)
    tally = thinkstats2.Hist(outcomes)
    return tally.Freqs(faces)
def ChiSquared(self, lengths):
    """Computes the chi-squared statistic.

    Observed counts are the frequencies of self.values in lengths;
    expected counts are self.expected_probs scaled by len(lengths).

    lengths: sequence of lengths

    returns: float
    """
    counts = thinkstats2.Hist(lengths).Freqs(self.values)
    observed = np.array(counts)
    expected = self.expected_probs * len(lengths)

    # per-value contribution: squared deviation relative to expectation
    contributions = (observed - expected)**2 / expected
    return sum(contributions)
def MakeHists(male, female):
    """Shows histograms of alcwknd for two groups side by side.

    male: DataFrame, drawn in the left panel (y axis up to 600)
    female: DataFrame, drawn in the right panel (y axis up to 1200)
    """
    thinkplot.PrePlot(rows=1, cols=2)

    panels = (
        (male, 1, 600, 'Weekend Alcohol Consumption for Men'),
        (female, 2, 1200, 'Weekend Alcohol Consumption for Women'),
    )
    for frame, position, ymax, title in panels:
        hist = thinkstats2.Hist(frame.alcwknd)
        thinkplot.SubPlot(position)
        thinkplot.Config(axis=[0, 800, 0, ymax],
                         ylabel='Number of people',
                         xlabel='Alcohol consumed (grams)',
                         title=title)
        thinkplot.Hist(hist, alpha=1)

    thinkplot.Show()
def main():
    """Runs the pregnancy-length analyses and prints summary statistics.

    Loads the frames with MakeDataframes, compares first and other
    births, prints mean, variance and standard deviation of prglngth for
    live births plus the effect size, then runs the two exercise helpers.
    """
    live, first, other = MakeDataframes()

    # The histogram plots of birthwgt_oz, agepreg and prglngth (drawHist)
    # are disabled.

    firstBabies(first, other)

    summaries = (
        ("Mean for Live birth : ", 'mean'),
        ("Variance for Live birth : ", 'var'),
        ("Standard Deviation for Live birth : ", 'std'),
    )
    for caption, stat_name in summaries:
        print(caption, getattr(live.prglngth, stat_name)())

    cohen_d = CohenEffectSize(first.prglngth, other.prglngth)
    print("Diff in mean per standard deviation : ", cohen_d)

    # exercise 2.3
    length_hist = thinkstats2.Hist(live.prglngth)
    ex_que2_3(length_hist)

    que2_4(first, other)
def MakeHists(live):
    """Plots agepreg for live births in two panels and saves the figure.

    The left panel draws the Hist with thinkplot.Hist, the right panel
    draws the same Hist with thinkplot.Pmf. Ages are floored with
    np.floor first. The figure is saved as probability_agepreg_hist.

    live: DataFrame
    """
    floored_ages = np.floor(live.agepreg)
    age_hist = thinkstats2.Hist(floored_ages, label='agepreg')

    thinkplot.PrePlot(2, cols=2)

    # left panel
    thinkplot.SubPlot(1)
    thinkplot.Hist(age_hist)
    thinkplot.Config(xlabel='years',
                     ylabel='frequency',
                     axis=[0, 45, 0, 700])

    # right panel
    thinkplot.SubPlot(2)
    thinkplot.Pmf(age_hist)
    thinkplot.Save(root='probability_agepreg_hist',
                   xlabel='years',
                   axis=[0, 45, 0, 700])
def PrintExtremes(live):
    """Plots the histogram of pregnancy lengths and prints the extremes.

    Saves the figure as first_nsfg_hist_live, then prints the 10
    smallest and the 10 largest lengths with their frequencies.

    live: DataFrame of live births
    """
    length_hist = thinkstats2.Hist(live.prglngth)
    thinkplot.Hist(length_hist, label='live births')
    thinkplot.Save(root='first_nsfg_hist_live',
                   title='Histogram',
                   xlabel='weeks',
                   ylabel='frequency')

    sections = (
        ('Shortest lengths:', 'Smallest'),
        ('Longest lengths:', 'Largest'),
    )
    for heading, method_name in sections:
        print(heading)
        for weeks, freq in getattr(length_hist, method_name)(10):
            print(weeks, freq)
def main(script):
    """Tests the functions in this module.

    Before the tests, prints the mean and variance of totalwgt_lb for
    firsts and others, their mean difference and the Cohen effect size.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    firsts_wgt = firsts.totalwgt_lb
    others_wgt = others.totalwgt_lb
    mean_firsts = firsts_wgt.mean()
    mean_others = others_wgt.mean()
    var_firsts = firsts_wgt.var()
    var_others = others_wgt.var()

    print('Mean Weight')
    print('First babies', mean_firsts)
    print('Others babies', mean_others)

    print('Variance in Weight')
    print('First babies', var_firsts)
    print('Others babies', var_others)

    print('Difference in lbs', mean_firsts - mean_others)

    effect = thinkstats2.CohenEffectSize(firsts.totalwgt_lb,
                                         others.totalwgt_lb)
    print('Cohen d', effect)

    # Mode: expected to be 39
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39, most_common

    # AllModes: the first entry is expected to have frequency 4693
    ranked = AllModes(length_hist)
    top_freq = ranked[0][1]
    assert top_freq == 4693, top_freq

    for value, freq in ranked[:5]:
        print(value, freq)

    print('%s: All tests passed.' % script)
def main(script):
    """Tests the functions in this module.

    Runs WeightDifference on the frames, then checks Mode and AllModes
    against known values and prints the first five AllModes entries.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    hist = thinkstats2.Hist(live.prglngth)

    # explore the weight difference between first babies and others
    WeightDifference(live, firsts, others)

    # test Mode
    mode = Mode(hist)
    print('Mode of preg length', mode)
    assert mode == 39, mode

    # test AllModes
    modes = AllModes(hist)
    assert modes[0][1] == 4693, modes[0][1]

    # A stray "ies" token after this loop header made the function a
    # syntax error; it has been removed.
    for value, freq in modes[:5]:
        print(value, freq)
def main(script):
    """Tests the functions in this module.

    After the tests pass, prints Cohen's d for totalwgt_lb of firsts
    versus others.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Mode: expected to be 39
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39

    # AllModes: the first entry is expected to have frequency 4693
    ranked = AllModes(length_hist)
    print(ranked)
    assert ranked[0][1] == 4693

    for value, freq in ranked[:5]:
        print(value, freq)

    print('%s: All tests passed.' % script)

    effect = cohen_d(firsts.totalwgt_lb, others.totalwgt_lb)
    print("Cohen's d:", effect)
def EstimateHazardFunction(complete, ongoing, label=''):
    """Estimates the hazard function by Kaplan-Meier.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    complete: list of complete lifetimes
    ongoing: list of ongoing lifetimes
    label: string

    returns: HazardFunction built from a map of t to hazard
    """
    # counts and survival function of the complete lifetimes
    num_complete = len(complete)
    ended_counts = thinkstats2.Hist(complete)
    surv_complete = SurvivalFunction(thinkstats2.Cdf(complete))

    # survival function of the ongoing lifetimes
    num_ongoing = len(ongoing)
    surv_ongoing = SurvivalFunction(thinkstats2.Cdf(ongoing))

    hazards = {}
    for t, ended in sorted(ended_counts.Items()):
        complete_left = num_complete * surv_complete[t]
        ongoing_left = num_ongoing * surv_ongoing[t]
        # at risk: cases ending at t plus those of either kind
        # surviving past t
        hazards[t] = ended / (ended + complete_left + ongoing_left)

    return HazardFunction(hazards, label=label)
# Collect column 4 (family size) and column 22 (romantic) from every
# comma-separated line of `data`; famList and romanList are expected to
# exist already.
for line in data:
    spLine = line.split(",")
    famList.append(spLine[4])
    romanList.append(spLine[22])

# remove first element (presumably the header row -- confirm)
famList = famList[1:]
romanList = romanList[1:]

# calculate the share of families with three or fewer members
famLE3 = famList.count("LE3") / float(len(famList))
print('family has three or less members percentage=', "{:.2f}".format(famLE3))

# calculate the share of students in a relationship
romanticY = romanList.count("yes") / float(len(romanList))
print('student in relationship percentage=', "{:.2f}".format(romanticY))

famSizeHist = thinkstats2.Hist(famList, label='famsize')
# The histogram gets its own name. Rebinding romanList to the Hist would
# make the 0/1 encoding at the bottom run over the Hist object instead of
# the per-student answers.
romanHist = thinkstats2.Hist(romanList, label='romantic')

# plot family size histogram
thinkplot.Hist(famSizeHist)
thinkplot.Show(xlabel='Value', ylabel='Frequency', title='Family Size Fig')

# plot romantic interest histogram
thinkplot.Hist(romanHist)
thinkplot.Show(xlabel='Value', ylabel='Frequency', title='Romantic Interest Fig')

# Use a one-sample t-test to evaluate whether this data set is a good
# sample or not.
# Our null hypothesis is that: true_mu = 0
# Encode the answers as 0/1. Real lists (not lazy map objects) are built
# so the values can be measured and read more than once.
famList = [1 if x == 'GT3' else 0 for x in famList]
romanList = [1 if x == 'yes' else 0 for x in romanList]
# Treat blank and placeholder categories as missing, then drop those rows.
# The cleaned column is assigned back to df: calling
# df[col].replace(..., inplace=True) works on a selection of the frame,
# which pandas warns about and which does not update df under
# Copy-on-Write.
df['Sex'] = df['Sex'].replace(['', 'Unknown'], np.nan)
df['Race'] = df['Race'].replace(['', 'Unknown', 'Other'], np.nan)
df['Drug'] = df['Drug'].replace(['', 'Unknown'], np.nan)
df.dropna(subset=['Sex', 'Race', 'Drug'], inplace=True)

# change data types
df['Year'] = df['Year'].astype(int)
df['Age'] = df['Age'].astype(int)

# create histograms and report characteristics
histYear = thinkstats2.Hist(df.Year)
thinkplot.Hist(histYear)
print("mean is", df.Year.mean())
# mode() can return several values; the largest one is reported
print("mode is", max(df.Year.mode()))
print("variance is", df.Year.var())
print("standard deviation is", df.Year.std())

histAge = thinkstats2.Hist(df.Age)
thinkplot.Hist(histAge, width=1)
print("mean is", df.Age.mean())
print("mode is", max(df.Age.mode()))
print("variance is", df.Age.var())
print("standard deviation is", df.Age.std())

histSex = thinkstats2.Hist(df.Sex)
thinkplot.Hist(histSex)

histRace = thinkstats2.Hist(df.Race)
thinkplot.Hist(histRace)