def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    For every caseid present in the pregnancy map, checks that the
    respondent's `pregnum` equals the number of pregnancy records listed
    for that caseid. Caseids that never appear in the pregnancy map are
    not visited.

    resp: respondent DataFrame

    returns: True if every visited caseid matches; otherwise prints
             (caseid, number of records, pregnum) for the first mismatch
             and returns False. A caseid that is missing from `resp` is
             reported as a mismatch with pregnum shown as None.
    """
    # read the pregnancy frame
    preg = nsfg.ReadFemPreg()

    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg)

    # build the caseid -> pregnum lookup once instead of scanning the whole
    # respondent frame for every caseid; setdefault keeps the first row
    # when a caseid is repeated
    pregnum_by_caseid = {}
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        pregnum_by_caseid.setdefault(caseid, pregnum)

    # iterate through the preg_map
    for caseid, indices in preg_map.items():
        pregnum = pregnum_by_caseid.get(caseid)

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            # report the value that was compared (no `index` exists in
            # this loop, so it cannot be used to look the value up again)
            print(caseid, len(indices), pregnum)
            return False

    return True
def ValidatePregnum(resp):
    """Check the respondent file's pregnum against the pregnancy file.

    resp: respondent DataFrame

    returns: True when each respondent's pregnum equals the number of
             pregnancy indices mapped to that respondent's caseid;
             otherwise prints (caseid, number of records, pregnum) for
             the first mismatch and returns False
    """
    # pregnancy frame -> map from caseid to list of pregnancy indices
    records_by_case = nsfg.MakePregMap(nsfg.ReadFemPreg())

    # walk the respondent pregnum series by row label
    for label, reported in resp.pregnum.items():
        case = resp.caseid[label]
        found = len(records_by_case[case])

        # the reported count must equal the number of pregnancy records
        if found == reported:
            continue

        print(case, found, reported)
        return False

    return True
def main(script):
    """Run this module's checks against the respondent and pregnancy files.

    script: string script name
    """
    # respondent file: load it and confirm the expected number of rows
    respondents = ReadFemResp()
    assert len(respondents) == 7643

    # pregnancy file: load it, show its shape, confirm the expected rows
    pregnancies = nsfg.ReadFemPreg()
    print(pregnancies.shape)
    assert len(pregnancies) == 13593

    # the pregnum column of the respondent frame must agree with the
    # number of entries in the pregnancy frame
    counts_agree = ValidatePregnum(respondents, pregnancies)
    assert counts_agree

    PrintPregNums(pregnancies)
    print('%s: All tests passed.' % script)
def main():
    """Run the survival plots and save the two by-decade figures.

    Seeds the random generator with 17, plots the pregnancy and Cycle 6
    marriage data, resamples, then saves the by-decade plots for Cycles
    5-7 under the roots 'survival4' and 'survival5'.
    """
    thinkstats2.RandomSeed(17)

    pregnancies = nsfg.ReadFemPreg()
    sf1 = PlotPregnancyData(pregnancies)

    # make the plots based on Cycle 6
    cycle6 = ReadFemResp2002()
    sf2 = PlotMarriageData(cycle6)
    ResampleSurvival(cycle6)
    PlotRemainingLifetime(sf1, sf2)

    # read Cycles 5 and 7
    cycle5 = ReadFemResp1995()
    cycle7 = ReadFemResp2010()
    all_cycles = [cycle5, cycle6, cycle7]

    # axis settings shared by both saved figures
    save_options = dict(
        xlabel='age (years)',
        ylabel='prob unmarried',
        xlim=[13, 45],
        ylim=[0, 1],
        formats=FORMATS,
    )

    # plot resampled survival functions by decade
    PlotResampledByDecade(all_cycles)
    thinkplot.Save(root='survival4', **save_options)

    # plot resampled survival functions by decade, with predictions
    PlotResampledByDecade(all_cycles, predict_flag=True, omit=[5])
    thinkplot.Save(root='survival5', **save_options)
def myNSFG():
    """Explore the pregnancy file and print one respondent's outcomes.

    Reads the pregnancy DataFrame, prints its summary, divides `agepreg`
    by 100, replaces the codes 97, 98 and 99 in `birthwgt_lb` and
    `birthwgt_oz` with NaN, derives `totalwgt_lb` (pounds plus ounces/16),
    and prints the `outcome` values of every record whose caseid is 10229.
    """
    df = nsfg.ReadFemPreg()

    # DataFrame.info() writes the summary itself and returns None, so it
    # is called directly rather than wrapped in print()
    df.info()

    # NOTE(review): if nsfg.ReadFemPreg already applies this cleaning, the
    # rescaling and replacement below run a second time -- confirm
    df["agepreg"] /= 100

    na_vals = [97, 98, 99]
    # assign the result back: replace(..., inplace=True) on a selected
    # column is not guaranteed to update the parent frame
    df["birthwgt_lb"] = df["birthwgt_lb"].replace(na_vals, np.nan)
    df["birthwgt_oz"] = df["birthwgt_oz"].replace(na_vals, np.nan)
    df["totalwgt_lb"] = df["birthwgt_lb"] + df["birthwgt_oz"] / 16.0

    # map each caseid to the row labels of its records;
    # Series.items() is used because Series.iteritems() no longer exists
    # in pandas 2.0 and later
    d = defaultdict(list)
    for index, caseid in df.caseid.items():
        d[caseid].append(index)

    caseid = 10229
    indices = d[caseid]
    print(df.outcome[indices].values)
# NOTE(review): the `def` line for the statements below is not visible in this excerpt.
    """Use the dict returned by MakePregMap to validate

    Prints how many caseids have a respondent `pregnum` that differs from
    the number of entries listed for that caseid in the dict.

    resp: dataframe with nsfg respondents
    preg: dataframe with nsfg pregnancies
    """
    # dict keyed by caseid; each value's length is compared with pregnum
    dict_preg = nsfg.MakePregMap(preg)
    # caseids whose counts disagree
    validatecases = []
    for key, value in dict_preg.items():
        # pregnum of the first respondent row with this caseid, minus the
        # number of entries recorded for it
        preg_val = resp.loc[resp.caseid == key, 'pregnum'].values[0] - len(value)
        if preg_val != 0:
            validatecases.append(key)
    # number of disagreeing caseids
    print(len(validatecases))


if __name__ == '__main__':
    main(*sys.argv)
    resp = nsfg.ReadFemResp()
    preg = nsfg.ReadFemPreg()
    # part one
    #print(resp.pregnum.value_counts().sort_index())
    # part two, first attempt
    #CrossValidatePregnum(resp, preg)
    # part three, with the dict
    CrossValPythonically(resp, preg)
# Author: Matt Xiao
# Loads the pregnancy frame and shows a histogram of a small sample.
import nsfg
import thinkstats2
import thinkplot

# Data frame from ThinkStats (loaded here; the plot below does not read it)
df = nsfg.ReadFemPreg()

# histogram of a hand-written sample of values
sample_values = [1, 2, 2, 3, 5]
hist = thinkstats2.Hist(sample_values)
thinkplot.Hist(hist)

# label the axes and display the figure
axis_labels = {'xlabel': 'value', 'ylabel': 'frequency'}
thinkplot.Show(**axis_labels)
def main(script): preg = nsfg.ReadFemPreg() # DataFrame resp = nsfg.ReadFemResp() # DataFrame preg_by_caseid = MakePregMap(preg) # dictionary total_pregnancies_by_caseid = {} for key, preg_list in preg_by_caseid.items(): total_preg = 0 for p in preg_list: total_preg += 1 total_pregnancies_by_caseid[key] = total_preg print(len(total_pregnancies_by_caseid)) print(len(resp)) # for k, v in total_pregnancies_by_caseid.items(): # print(k,v) # # iterate through the respondent pregnum series # for index, pregnum in resp.pregnum.iteritems(): # caseid = resp.caseid[index] # indices = total_pregnancies_by_caseid[caseid] # # check that pregnum from the respondent file equals # # the number of records in the pregnancy file # if indices != pregnum: # print(caseid, indices, pregnum) # preg_pregnum = pd.DataFrame([total_pregnancies_by_caseid], columns=['caseid', 'pregnum']) # result = ValidatePregnum(resp, preg_pregnum) # print(result) # df = ReadFemResp() # print(df.pregnum.head()) # print(df.pregnum.value_counts().sort_index()) # bins = [0,1,2,3,4,5,6,100] # print(pd.cut(df.pregnum, bins).value_counts().sort_index()) # preg = nsfg.ReadFemPreg() # resp = nsfg.ReadFemResp() # print(ValidatePregnum(resp, preg)) # print(preg.head()) # pregnum_map = nsfg.MakePregMap(preg) # # pprint(pregnum_map) # print(len(pregnum_map)) # print(len(resp)) # for key, value in pregnum_map.items(): # pass # print(key, len(value)) # print(type(resp.pregnum[key])) # if resp.pregnum[key] == len(value): # print("MATCH") # elif resp.pregnum[key] != len(value): # print("NO MATCH") # else: # print("ERROR") # print(pregnum_map) # print(resp.pregnum) # caseid = 12556 # pregnum_map = nsfg.MakePregMap(preg) # indices = pregnum_map[caseid] # # resp.pregnum[indices].values # result = preg.pregnum # print(result) # print(resp.head()) # print(result) """Tests the functions in this module.
def MakeDataFrames():
    """Read the pregnancy file and split it into three DataFrames.

    returns: tuple (live, first, other) where `live` holds the rows with
             outcome == 1, `first` the live rows with birthord == 1, and
             `other` the live rows with birthord != 1
    """
    pregnancies = nsfg.ReadFemPreg()

    # keep only the records whose outcome code is 1
    live_births = pregnancies[pregnancies.outcome == 1]

    # partition those records on birth order
    birth_order = live_births.birthord
    first_births = live_births[birth_order == 1]
    later_births = live_births[birth_order != 1]

    return live_births, first_births, later_births
def main():
    """Print the frame returned by ReadData, then the pregnancy frame's shape."""
    data = ReadData()
    print(data)

    # only the (rows, columns) shape of the pregnancy frame is shown
    preg_shape = nsfg.ReadFemPreg().shape
    print(preg_shape)
# coding: utf-8

# In[1]:

from __future__ import print_function, division

import nsfg


# In[2]:

# Load the pregnancy DataFrame; a single read is enough because every
# cell below reuses `pres`.
pres = nsfg.ReadFemPreg()


# In[3]:

# whole frame (rendered only when run as a notebook cell)
pres


# In[4]:

# first 20 rows
pres.head(20)


# In[5]:

# last 30 rows
pres.tail(30)


# In[6]:

# report the size of the frame
print("There are", len(pres.index), "rows and", len(pres.columns), "columns.")


# In[7]:
#import pandas as pd import random import scipy.stats import sys directory = '/Users/warren/Data_Science/metis/github/prework/dsp/ThinkStats2/code/' sys.path.append(directory) import nsfg import thinkstats2 import thinkplot # Q1 # Would like to have a concise/readable way to perform these data manipulations # without the need to define the intermediate "live" table, as done here. preg = nsfg.ReadFemPreg(dct_file=directory + '2002FemPreg.dct', dat_file=directory + '2002FemPreg.dat.gz') live = preg[preg.outcome == 1] firsts = live[live.birthord == 1]['totalwgt_lb'] others = live[live.birthord != 1]['totalwgt_lb'] # This was defined in the book, but it's not in nsfg, so I copied and pasted the # code here. I can understand its contents. def CohenEffectSize(group1, group2): diff = group1.mean() - group2.mean() var1 = group1.var() var2 = group2.var() n1, n2 = len(group1), len(group2)
def MakeDataframes():
    """Load the pregnancy file and return (live, first, other).

    live:  rows with outcome == 1
    first: rows of `live` with birthord == 1
    other: rows of `live` with birthord != 1
    """
    preg_df = nsfg.ReadFemPreg()
    live = preg_df[preg_df.outcome == 1]

    # boolean masks over the live rows
    is_first = live.birthord == 1
    is_other = live.birthord != 1

    return (live, live[is_first], live[is_other])
def ValueCounts(resp):
    """Return how often each `pregnum` value occurs in the pregnancy file.

    resp: respondent DataFrame; not read by this function, kept so that
          existing callers keep working

    returns: Series of counts indexed by pregnum value, sorted by index
    """
    # NOTE(review): `resp` is unused -- if the counts were meant to come
    # from the respondent file, this should read resp.pregnum; confirm
    preg = nsfg.ReadFemPreg()

    # DataFrame has no `pregnum_counts` method; count the values of the
    # pregnum column and hand the result back instead of discarding it
    return preg.pregnum.value_counts().sort_index()
def validate():
    """Load the respondent and pregnancy files and total the pregnum counts.

    Nothing is returned or printed by the statements in this function.
    """
    respondents = ReadFemResp()

    # frequency of each pregnum value, then the sum of those frequencies
    pregnum_frequencies = respondents.pregnum.value_counts()
    pregs = sum(pregnum_frequencies)

    preg = nsfg.ReadFemPreg()
def main(script):
    """Print the pregnum frequencies of the pregnancy file and build its map.

    script: not read by the statements in this function
    """
    pregnancies = nsfg.ReadFemPreg()

    # frequency of each pregnum value, shown in index order
    pregnum_freq = pregnancies.pregnum.value_counts()
    print(pregnum_freq.sort_index())

    preg_map = nsfg.MakePregMap(pregnancies)