示例#1
0
def MakeHists(live):
    """Save one histogram figure per variable of the live-births table.

    Four figures are produced, in this order: birthwgt_lb, birthwgt_oz,
    agepreg (floored with np.floor) and prglngth.

    live: DataFrame of live births
    """
    def save_hist(values, label, **save_options):
        # Build the histogram, draw it, then hand the figure to Save.
        thinkplot.Hist(thinkstats2.Hist(values, label=label))
        thinkplot.Save(**save_options)

    save_hist(live.birthwgt_lb, 'birthwgt_lb',
              root='first_wgt_lb_hist',
              xlabel='pounds',
              ylabel='frequency',
              axis=[-1, 14, 0, 3200])

    save_hist(live.birthwgt_oz, 'birthwgt_oz',
              root='first_wgt_oz_hist',
              xlabel='ounces',
              ylabel='frequency',
              axis=[-1, 16, 0, 1200])

    # No explicit axis limits are given for this figure.
    save_hist(np.floor(live.agepreg), 'agepreg',
              root='first_agepreg_hist',
              xlabel='years',
              ylabel='frequency')

    save_hist(live.prglngth, 'prglngth',
              root='first_prglngth_hist',
              xlabel='weeks',
              ylabel='frequency',
              axis=[-1, 53, 0, 5000])
示例#2
0
def printFirstVsOthersHistograms():
    """Show the pregnancy-length histograms of two groups on one plot.

    Takes no arguments: it reads the names `firsts` and `others` from the
    enclosing scope and uses their `prglngth` attribute.
    """
    bar_width = 0.45
    hist_firsts = thinkstats2.Hist(firsts.prglngth)
    hist_others = thinkstats2.Hist(others.prglngth)

    thinkplot.PrePlot(2)
    # The two histograms are drawn with opposite alignment.
    panels = (
        (hist_firsts, 'right', 'first'),
        (hist_others, 'left', 'others'),
    )
    for hist, side, name in panels:
        thinkplot.Hist(hist, align=side, width=bar_width, label=name)

    thinkplot.Show(xlabel='weeks', ylabel='frequency', xlim=[27, 46])
示例#3
0
    def testHist(self):
        """Check that Hist counts values from a string and from a Counter.

        Also checks that two Hists built from the same letters in a
        different order compare equal.
        """
        # assertEqual is used throughout: the assertEquals alias is
        # deprecated and was removed from unittest in Python 3.12.
        hist = thinkstats2.Hist('allen')
        self.assertEqual(len(hist), 4)
        self.assertEqual(hist.Freq('l'), 2)

        hist = thinkstats2.Hist(Counter('allen'))
        self.assertEqual(len(hist), 4)
        self.assertEqual(hist.Freq('l'), 2)

        hist2 = thinkstats2.Hist('nella')
        self.assertEqual(hist, hist2)
示例#4
0
def firstBabies(first, others):
    """Print the mean pregnancy length of two groups and their percent gap.

    first: object whose `prglngth` attribute provides `.mean()`
    others: object of the same kind for the comparison group

    The percentage is the absolute difference of the two means relative
    to the mean of `others`. Nothing is returned.
    """
    first_mean = first.prglngth.mean()
    other_mean = others.prglngth.mean()
    print("Mean of Pregnancy length of first born :", first_mean)
    print("Mean of Pregnancy length of other than first born :", other_mean)

    # Absolute gap between the means, as a percentage of the others' mean.
    pct_mean = (abs(first_mean - other_mean) * 100) / other_mean
    print("% change between first and other born : ", pct_mean)
def print_num_albums_per_artist(all_genres):
    """Print album-count statistics per artist and show their histogram.

    all_genres: dict mapping each artist to a sized collection of albums

    Prints the totals, the average number of albums per artist and the
    number of artists with more than 6 albums, then shows a histogram of
    how many artists have each album count. When `all_genres` is empty
    only the totals line is printed, because there is nothing to average
    or plot.
    """
    num_albums_list = [len(albums) for albums in all_genres.values()]

    # album count -> number of artists with exactly that many albums
    num_albums_counts = {}
    for count in num_albums_list:
        num_albums_counts[count] = num_albums_counts.get(count, 0) + 1

    num_artists = len(all_genres)
    num_albums = sum(num_albums_list)
    print("In total,", num_artists, "artists, producing", num_albums,
          "albums.")

    if num_artists == 0:
        # Avoid dividing by zero below.
        return

    print("An average of", "%.2f" % (num_albums / num_artists),
          "albums per artist.")

    num_albums_hist = ts2.Hist(num_albums_counts)
    artists_more_than_6_albums = sum(
        v for k, v in num_albums_hist.Items() if k > 6)

    print(artists_more_than_6_albums, 'artists with more than 6 albums.')

    tp.Hist(num_albums_hist)
    tp.Show(xlabel='Number of albums',
            ylabel='Count of artists with this number of albums',
            title='Histogram of the number of albums per artist')
示例#6
0
def ex3():
    """Simulate games and plot the results on a 2x2 grid.

    Panel 1: histogram of 1000 simulated goal totals at lam=4.
    Panel 2: their CDF with vertical lines at the 5th and 95th percentiles.
    Panel 3: the value returned by RMSE for lam = 1..14.
    Panel 4: the value returned by RMSE for sample sizes 10, 20, ..., 990.

    The RMSE of the first batch of simulations is also printed.
    """
    def mark_x(x, y=1):
        # Light grey vertical segment from 0 up to y at position x.
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    lam = 4
    games_per_run = 1000

    totals = [SimulateGame(lam=lam) for _ in range(games_per_run)]
    print('RMSE', RMSE(totals, lam))
    totals_hist = thinkstats2.Hist(totals)
    totals_cdf = thinkstats2.Cdf(totals)

    thinkplot.PrePlot(rows=2, cols=2)

    thinkplot.SubPlot(1)
    thinkplot.Hist(totals_hist)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(totals_cdf)
    for pct in (5, 95):
        mark_x(totals_cdf.Percentile(pct))

    thinkplot.SubPlot(3)
    # lambda vs. rmse
    # rmse goes up as lambda goes up
    rates = range(1, 15)
    errors_by_rate = []
    for rate in rates:
        simulated = [SimulateGame(lam=rate) for _ in range(games_per_run)]
        errors_by_rate.append(RMSE(simulated, rate))
    thinkplot.Plot(rates, errors_by_rate)

    thinkplot.SubPlot(4)
    # m vs. rmse
    # maybe rmse very slowly goes down as m goes up?
    # not at all clear that's really the case...
    sample_sizes = np.arange(10, 1000, 10)
    errors_by_size = []
    for size in sample_sizes:
        # SimulateGame is called with its default rate here; the 4 is
        # presumably that default -- TODO confirm.
        simulated = [SimulateGame() for _ in range(size)]
        errors_by_size.append(RMSE(simulated, 4))
    thinkplot.Plot(sample_sizes, errors_by_size)

    thinkplot.show()
示例#7
0
def EstimateHazardFunction(complete, ongoing, label='', shift=1e-7):
    """Estimates the hazard function by Kaplan-Meier.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    complete: list of complete lifetimes
    ongoing: list of ongoing lifetimes
    label: string
    shift: presumed additional survival of ongoing

    returns: HazardFunction built from a dict mapping each complete
             lifetime t to (cases ended at t) / (cases at risk at t)
    """
    # Counts and survival function of the complete lifetimes.
    num_complete = len(complete)
    ended_counts = thinkstats2.Hist(complete)
    surv_complete = SurvivalFunction(thinkstats2.Cdf(complete))

    # Survival function of the ongoing lifetimes.
    # The shift is a regrettable hack needed to deal with simultaneity.
    # If a case is complete at some t and another case is ongoing
    # at t, we presume that the ongoing case exceeds t+shift.
    num_ongoing = len(ongoing)
    shifted_cdf = thinkstats2.Cdf(ongoing).Shift(shift)
    surv_ongoing = SurvivalFunction(shifted_cdf)

    def at_risk(t, ended):
        # Cases ending at t plus the cases, complete or ongoing, that
        # survive past t.
        return (ended + num_complete * surv_complete[t]
                + num_ongoing * surv_ongoing[t])

    hazard_by_time = {
        t: ended / at_risk(t, ended)
        for t, ended in sorted(ended_counts.Items())
    }

    return HazardFunction(hazard_by_time, label=label)
示例#8
0
 def RunModel(self):
     """Simulate as many 'H'/'T' draws as the two observed counts add up to.

     self.data is unpacked into exactly two counts. They are kept in
     local variables rather than being stored as attributes on the
     module-level `df`, which changed shared state and made `range(n)`
     fail whenever those attributes were real DataFrame columns.

     returns: tuple (number of 'H' draws, number of 'T' draws)
     """
     # NOTE(review): the original named these after df.tripduration and
     # df.temperature; confirm with the caller what self.data holds.
     first_count, second_count = self.data
     n = first_count + second_count
     sample = [random.choice('HT') for _ in range(n)]
     hist = thinkstats2.Hist(sample)
     data = hist['H'], hist['T']
     return data
示例#9
0
 def RunModel(self):
     """Simulate as many coin flips as were observed.

     Unpacks (heads, tails) from self.data, draws heads + tails
     characters from 'HT' with random.choice and counts them.

     returns: tuple (simulated 'H' count, simulated 'T' count)
     """
     observed_heads, observed_tails = self.data
     num_flips = observed_heads + observed_tails

     flips = [random.choice('HT') for _ in range(num_flips)]
     tally = thinkstats2.Hist(flips)
     return tally['H'], tally['T']
示例#10
0
 def RunModel(self):
     """Simulate as many die rolls as the observed frequencies add up to.

     Draws sum(self.data) values from 1..6 with np.random.choice.

     returns: the simulated frequencies of the faces 1..6, in that order
     """
     faces = [1, 2, 3, 4, 5, 6]
     num_rolls = sum(self.data)
     simulated = np.random.choice(faces, num_rolls, replace=True)
     return thinkstats2.Hist(simulated).Freqs(faces)
示例#11
0
def main(script):
    """Run the module's checks, then compare birth weights by group.

    Checks Mode and AllModes against the pregnancy-length histogram,
    prints the five leading (value, freq) pairs, the mean total weight
    of both groups with their difference, and the Cohen effect size.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Mode check
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39, most_common

    # AllModes check: frequency of the leading entry
    ranked = AllModes(length_hist)
    assert ranked[0][1] == 4693, ranked[0][1]

    for value, freq in ranked[:5]:
        print(value, freq)

    mean_firsts = firsts.totalwgt_lb.mean()
    mean_others = others.totalwgt_lb.mean()
    summary = "firsts = {} pounds, others = {} pounds, dif = {} pounds "
    print(summary.format(mean_firsts, mean_others, mean_firsts - mean_others))

    print(thinkstats2.CohenEffectSize(firsts.totalwgt_lb, others.totalwgt_lb))

    print('%s: All tests passed.' % script)
示例#12
0
def main(script):
    """Run the module's checks and print two effect sizes.

    Checks Mode and AllModes against the pregnancy-length histogram,
    prints the five leading (value, freq) pairs, then prints the results
    of WeightDifferences and PregnancyLengthDifferences.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Mode check
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39

    # AllModes check: frequency of the leading entry
    ranked = AllModes(length_hist)
    assert ranked[0][1] == 4693

    for value, freq in ranked[:5]:
        print(value, freq)

    weight_effect = WeightDifferences(firsts, others, live)
    print("Cohens'd Effect of weight differences:", weight_effect)

    length_effect = PregnancyLengthDifferences(firsts, others)
    print("Cohens'd Effect of pregnancy length differences:", length_effect)

    print('%s: All tests passed.' % script)
示例#13
0
def main(script):
    """Run the two comparisons, then the Mode and AllModes checks.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Group comparisons come first, each given all three frames.
    for compare in (weight_comparison, preg_length_comparison):
        compare(live, firsts, others)

    # Mode check
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39, most_common

    # AllModes check: frequency of the leading entry
    ranked = AllModes(length_hist)
    assert ranked[0][1] == 4693, ranked[0][1]

    for value, freq in ranked[:5]:
        print(value, freq)

    print('%s: All tests passed.' % script)
示例#14
0
def MakeHists(greq, less):
    """Save a pregnancy-length histogram figure for each of two groups.

    greq: DataFrame with a prglngth column (saved as 'greq_prglngth_hist')
    less: DataFrame with a prglngth column (saved as 'less_prglngth_hist')
    """
    def save_lengths(frame, root, axis):
        # Both figures share the label and axis titles; only the data,
        # the output name and the axis limits differ.
        thinkplot.Hist(thinkstats2.Hist(frame.prglngth, label='prglngth'))
        thinkplot.Save(root=root,
                       xlabel='weeks',
                       ylabel='frequency',
                       axis=axis)

    save_lengths(greq, 'greq_prglngth_hist', [-1, 53, 0, 1000])
    save_lengths(less, 'less_prglngth_hist', [-1, 53, 0, 5000])
示例#15
0
def PairwiseDiffInPrglngthOfSameResp(preg_map, preg):
    """Plot a histogram of pregnancy-length differences per respondent.

    For every entry of preg_map that lists at least two indices, the
    difference between the prglngth values of its first two rows is
    counted.

    preg_map: dict mapping a case id to a list of row labels in preg
    preg: DataFrame with a prglngth column
    """
    diff_hist = thinkstats2.Hist()

    for row_labels in preg_map.values():
        if len(row_labels) < 2:
            continue
        first_two = preg.loc[row_labels[:2]].prglngth
        # np.diff of a two-element sequence gives a single value.
        delta = np.diff(first_two)[0]
        diff_hist[delta] += 1

    thinkplot.Hist(diff_hist)
示例#16
0
def MakeComparison(firsts, others):
    """Save one figure comparing pregnancy lengths of the two groups.

    firsts: DataFrame with a prglngth column, labelled 'first'
    others: DataFrame with a prglngth column, labelled 'other'
    """
    bar_width = 0.45
    hist_firsts = thinkstats2.Hist(firsts.prglngth, label='first')
    hist_others = thinkstats2.Hist(others.prglngth, label='other')

    thinkplot.PrePlot(2)
    # The two histograms are drawn with opposite alignment.
    for hist, side in ((hist_firsts, 'right'), (hist_others, 'left')):
        thinkplot.Hist(hist, align=side, width=bar_width)

    save_options = dict(root='first_nsfg_hist',
                        title='Histogram',
                        xlabel='weeks',
                        ylabel='frequency',
                        axis=[27, 46, 0, 2700])
    thinkplot.Save(**save_options)
示例#17
0
def ProbilityMassFunction(group):
    """Return a thinkstats2.Pmf built from `group`.

    group: the values to build the Pmf from

    An earlier version first built a Hist and a value -> freq / total
    dict by hand and then discarded both. That work was dead code, and
    it would have used up a one-shot iterator before Pmf received it, so
    it has been removed.
    """
    return thinkstats2.Pmf(group)
示例#18
0
    def RunModel(self):
        """Run the model of the null hypothesis.

        Draws as many characters from 'HT' as there were observed heads
        and tails combined, and counts each outcome.

        returns: simulated data as a tuple ('H' count, 'T' count)
        """
        observed_heads, observed_tails = self.data
        num_flips = observed_heads + observed_tails

        flips = [random.choice('HT') for _ in range(num_flips)]
        tally = thinkstats2.Hist(flips)
        return tally['H'], tally['T']
示例#19
0
    def RunModel(self):
        """Run the model of the null hypothesis.

        Draws sum(self.data) values from 1..6 with np.random.choice.

        returns: simulated data, the frequencies of the faces 1..6 in order
        """
        faces = [1, 2, 3, 4, 5, 6]
        num_rolls = sum(self.data)
        simulated = np.random.choice(faces, num_rolls, replace=True)
        return thinkstats2.Hist(simulated).Freqs(faces)
示例#20
0
    def ChiSquared(self, lengths):
        """Computes the chi-squared statistic.

        Observed counts are the frequencies of self.values in `lengths`;
        expected counts are self.expected_probs scaled by len(lengths).

        lengths: sequence of lengths

        returns: float
        """
        counts = thinkstats2.Hist(lengths)
        observed = np.array(counts.Freqs(self.values))
        expected = self.expected_probs * len(lengths)
        # One term per value: squared deviation relative to expectation.
        terms = (observed - expected)**2 / expected
        return sum(terms)
示例#21
0
def MakeHists(male, female):
    """Show two side-by-side histograms of the alcwknd column.

    male: DataFrame with an alcwknd column (left panel)
    female: DataFrame with an alcwknd column (right panel)
    """
    def draw_panel(index, values, y_max, title):
        # Build the histogram, select the panel, configure it, draw it.
        panel_hist = thinkstats2.Hist(values)
        thinkplot.SubPlot(index)
        thinkplot.Config(axis=[0, 800, 0, y_max],
                         ylabel='Number of people',
                         xlabel='Alcohol consumed (grams)',
                         title=title)
        thinkplot.Hist(panel_hist, alpha=1)

    thinkplot.PrePlot(rows=1, cols=2)
    draw_panel(1, male.alcwknd, 600,
               'Weekend Alcohol Consumption for Men')
    draw_panel(2, female.alcwknd, 1200,
               'Weekend Alcohol Consumption for Women')
    thinkplot.Show()
示例#22
0
def main():
    """Run the pregnancy-length analysis steps in sequence.

    Loads the three frames, calls firstBabies, prints the mean, variance
    and standard deviation of prglngth for live births, prints the
    CohenEffectSize of first vs. other pregnancy lengths, and finally
    runs ex_que2_3 and que2_4.
    """
    live, first, other = MakeDataframes()

    # The drawHist plots of birthwgt_oz, agepreg and prglngth for live
    # births are currently switched off.
    firstBabies(first, other)

    print("Mean for Live birth : ", live.prglngth.mean())
    print("Variance for Live birth : ", live.prglngth.var())
    print("Standard Deviation for Live birth : ", live.prglngth.std())

    effect = CohenEffectSize(first.prglngth, other.prglngth)
    print("Diff in mean per standard deviation : ", effect)

    # exercise 2.3
    length_hist = thinkstats2.Hist(live.prglngth)
    ex_que2_3(length_hist)
    que2_4(first, other)
示例#23
0
def MakeHists(live):
    """Save a two-panel figure of the floored agepreg values.

    The first panel draws the histogram with thinkplot.Hist; the second
    draws the same Hist object with thinkplot.Pmf.

    live: DataFrame with an agepreg column
    """
    age_hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
    thinkplot.PrePlot(2, cols=2)

    # Panel 1: histogram.
    thinkplot.SubPlot(1)
    thinkplot.Hist(age_hist)
    thinkplot.Config(xlabel='years', ylabel='frequency',
                     axis=[0, 45, 0, 700])

    # Panel 2: the same object drawn through thinkplot.Pmf.
    thinkplot.SubPlot(2)
    thinkplot.Pmf(age_hist)

    save_options = dict(root='probability_agepreg_hist',
                        xlabel='years',
                        axis=[0, 45, 0, 700])
    thinkplot.Save(**save_options)
示例#24
0
def PrintExtremes(live):
    """Save the pregnancy-length histogram and print its extremes.

    After saving the figure, prints the pairs returned by
    hist.Smallest(10) and then by hist.Largest(10), one per line.

    live: DataFrame of live births
    """
    length_hist = thinkstats2.Hist(live.prglngth)
    thinkplot.Hist(length_hist, label='live births')

    thinkplot.Save(root='first_nsfg_hist_live',
                   title='Histogram',
                   xlabel='weeks',
                   ylabel='frequency')

    sections = (
        ('Shortest lengths:', length_hist.Smallest),
        ('Longest lengths:', length_hist.Largest),
    )
    for heading, pick in sections:
        print(heading)
        for weeks, freq in pick(10):
            print(weeks, freq)
示例#25
0
def main(script):
    """Print a birth-weight comparison, then run the module's checks.

    Prints the mean and variance of totalwgt_lb for both groups, the
    difference of the means and the Cohen effect size, then checks Mode
    and AllModes against the pregnancy-length histogram.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    mean_firsts = firsts.totalwgt_lb.mean()
    mean_others = others.totalwgt_lb.mean()

    var_firsts = firsts.totalwgt_lb.var()
    var_others = others.totalwgt_lb.var()

    report = (
        ('Mean Weight', mean_firsts, mean_others),
        ('Variance in Weight', var_firsts, var_others),
    )
    for heading, for_firsts, for_others in report:
        print(heading)
        print('First babies', for_firsts)
        print('Others babies', for_others)

    print('Difference in lbs', mean_firsts - mean_others)

    effect = thinkstats2.CohenEffectSize(firsts.totalwgt_lb,
                                         others.totalwgt_lb)
    print('Cohen d', effect)

    # Mode check
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39, most_common

    # AllModes check: frequency of the leading entry
    ranked = AllModes(length_hist)
    assert ranked[0][1] == 4693, ranked[0][1]

    for value, freq in ranked[:5]:
        print(value, freq)

    print('%s: All tests passed.' % script)
示例#26
0
def main(script):
    """Tests the functions in this module.

    Runs WeightDifference on the three frames, then checks Mode and
    AllModes against the pregnancy-length histogram and prints the five
    leading (value, freq) pairs.

    script: string script name (not used in this function)
    """
    live, firsts, others = first.MakeFrames()
    hist = thinkstats2.Hist(live.prglngth)

    # explore the weight difference between first babies and others
    WeightDifference(live, firsts, others)

    # test Mode
    mode = Mode(hist)
    print('Mode of preg length', mode)
    assert mode == 39, mode

    # test AllModes
    modes = AllModes(hist)
    assert modes[0][1] == 4693, modes[0][1]

    # A stray "ies" token after the colon made this loop a syntax error.
    for value, freq in modes[:5]:
        print(value, freq)
示例#27
0
def main(script):
    """Run the module's checks, then print Cohen's d for birth weight.

    script: string script name
    """
    live, firsts, others = first.MakeFrames()
    length_hist = thinkstats2.Hist(live.prglngth)

    # Mode check
    most_common = Mode(length_hist)
    print('Mode of preg length', most_common)
    assert most_common == 39

    # AllModes check: the full result is printed before the assertion
    ranked = AllModes(length_hist)
    print(ranked)
    assert ranked[0][1] == 4693

    for value, freq in ranked[:5]:
        print(value, freq)

    print('%s: All tests passed.' % script)

    # The effect size is reported after the "passed" line.
    effect = cohen_d(firsts.totalwgt_lb, others.totalwgt_lb)
    print("Cohen's d:", effect)
示例#28
0
def EstimateHazardFunction(complete, ongoing, label=''):
    """Estimates the hazard function by Kaplan-Meier.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    complete: list of complete lifetimes
    ongoing: list of ongoing lifetimes
    label: string

    returns: HazardFunction built from a dict mapping each complete
             lifetime t to (cases ended at t) / (cases at risk at t)
    """
    # Counts and survival function of the complete lifetimes.
    num_complete = len(complete)
    ended_counts = thinkstats2.Hist(complete)
    surv_complete = SurvivalFunction(thinkstats2.Cdf(complete))

    # Survival function of the ongoing lifetimes.
    num_ongoing = len(ongoing)
    surv_ongoing = SurvivalFunction(thinkstats2.Cdf(ongoing))

    def at_risk(t, ended):
        # Cases ending at t plus the cases, complete or ongoing, that
        # survive past t.
        return (ended + num_complete * surv_complete[t]
                + num_ongoing * surv_ongoing[t])

    hazard_by_time = {
        t: ended / at_risk(t, ended)
        for t, ended in sorted(ended_counts.Items())
    }

    return HazardFunction(hazard_by_time, label=label)
示例#29
0
# Collect the family-size (field 4) and romantic-interest (field 22)
# values from every comma-separated line.
for line in data:
    spLine = line.split(",")
    famList.append(spLine[4])
    romanList.append(spLine[22])
# remove first element (presumably the header row -- TODO confirm)
famList = famList[1:]
romanList = romanList[1:]

# calculate percentage of family size have three or less family members
famLE3 = famList.count("LE3") / float(len(famList))
print('family has three or less members percentage=', "{:.2f}".format(famLE3))
# calculate student in relationship percentage
romanticY = romanList.count("yes") / float(len(romanList))
print('student in relationship percentage=', "{:.2f}".format(romanticY))

# The romantic histogram gets its own name. Reusing `romanList` for it
# replaced the per-student answers with a Hist, so the 0/1 recoding at
# the bottom iterated over the Hist instead of over the answers.
famSizeHist = thinkstats2.Hist(famList, label='famsize')
romanHist = thinkstats2.Hist(romanList, label='romantic')

# plot family size histogram
thinkplot.Hist(famSizeHist)
thinkplot.Show(xlabel='Value', ylabel='Frequency', title='Family Size Fig')
# plot romantic interest histogram
thinkplot.Hist(romanHist)
thinkplot.Show(xlabel='Value',
               ylabel='Frequency',
               title='Romantic Interest Fig')

# Use One Sample T Test to evaluate whether this data set is a good sample or not.
# Our null hypothesis is that: true_mu = 0
# Recode to 0/1 as real lists so the samples can be measured and
# traversed more than once (a bare map() is a one-shot iterator).
famList = [1 if x == 'GT3' else 0 for x in famList]
romanList = [1 if x == 'yes' else 0 for x in romanList]
示例#30
0
# Treat blank and placeholder categories as missing, then drop those rows.
# The replaced column is assigned back to df. Calling
# replace(..., inplace=True) on df[col] acts on an intermediate object:
# pandas 2.x warns about it, and under Copy-on-Write it leaves df
# unchanged.
df['Sex'] = df['Sex'].replace(['', 'Unknown'], np.nan)
df.dropna(subset=['Sex'], inplace=True)
df['Race'] = df['Race'].replace(['', 'Unknown', 'Other'], np.nan)
df.dropna(subset=['Race'], inplace=True)
df['Drug'] = df['Drug'].replace(['', 'Unknown'], np.nan)
df.dropna(subset=['Drug'], inplace=True)
#change data types
df['Year'] = df['Year'].astype(int)
df['Age'] = df['Age'].astype(int)

#create histograms and report characteristics
histYear = thinkstats2.Hist(df.Year)
thinkplot.Hist(histYear)
print("mean is", df.Year.mean())
# mode() can return several values; the largest one is reported
print("mode is", max(df.Year.mode()))
print("variance is", df.Year.var())
print("standard deviation is", df.Year.std())
histAge = thinkstats2.Hist(df.Age)
thinkplot.Hist(histAge, width=1)
print("mean is", df.Age.mean())
print("mode is", max(df.Age.mode()))
print("variance is", df.Age.var())
print("standard deviation is", df.Age.std())
histSex = thinkstats2.Hist(df.Sex)
thinkplot.Hist(histSex)
histRace = thinkstats2.Hist(df.Race)
thinkplot.Hist(histRace)