def main(script):
    """Tests the functions in this module.

    Checks the pregnum value counts against the figures published in the
    NSFG codebook, then cross-validates the respondent file against the
    pregnancy file. Any failed check raises AssertionError.

    script: string script name
    """
    resp = ReadFemResp()
    pregnum = resp.pregnum

    # compare value counts with the codebook
    # https://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=FEM&section=R&subSec=7869&srtLabel=606835
    counts = pregnum.value_counts().sort_index()
    codebook_counts = {
        0: 2610,
        1: 1267,
        2: 1432,
        3: 1110,
        4: 611,
        5: 305,
        6: 150,
    }
    for value, expected in codebook_counts.items():
        assert counts[value] == expected

    # everything after the first seven rows is summed into one figure;
    # presumably this is the codebook's combined "7 or more" row
    assert counts[7:].sum() == 158

    # cross-validation
    preg = nsfg.ReadFemPreg()

    # the pregnum values must add up to the number of pregnancy records
    assert pregnum.sum() == len(preg)

    # check each respondent against the records listed for its caseid
    preg_map = nsfg.MakePregMap(preg)
    for row_label, reported in resp.pregnum.items():
        records = preg_map[resp.caseid[row_label]]
        assert reported == len(records)

    print('%s: All tests passed.' % script)
def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    Compares each respondent's pregnum with the number of records listed
    for the same caseid in the pregnancy file. The first mismatch is
    printed as (caseid, record count, pregnum).

    resp: respondent DataFrame

    returns: True if every respondent matches, False at the first mismatch
    """
    # Read the pregnancy frame into its own name. Binding it to `resp`
    # discarded the caller's frame and left `preg` undefined below.
    preg = nsfg.ReadFemPreg()

    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg)

    # iterate through the respondent pregnum series
    # (Series.items() replaces iteritems(), which pandas 2.0 removed)
    for index, pregnum in resp.pregnum.items():
        caseid = resp.caseid[index]
        indices = preg_map[caseid]

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    Walks the caseids found in the pregnancy file and compares the number
    of pregnancy records for each with the pregnum value stored for that
    caseid in the respondent frame. The first mismatch is printed as
    (caseid, record count, pregnum).

    resp: respondent DataFrame

    returns: True if every caseid matches, False at the first mismatch

    raises: KeyError if a caseid from the pregnancy file has no row in resp
    """
    # make a dictionary that maps from caseid to respondent index;
    # setdefault keeps the first row should a caseid ever repeat
    d = {}
    for index, caseid in resp.caseid.items():
        d.setdefault(caseid, index)

    # read the pregnancy frame
    preg = nsfg.ReadFemPreg()

    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg)

    # iterate through the preg_map
    for caseid, indices in preg_map.items():
        # One dictionary lookup replaces a boolean-mask scan of the whole
        # frame per caseid, and the value compared is the value reported.
        pregnum = resp.pregnum[d[caseid]]

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    Each row of resp carries a caseid and a pregnum; pregnum is expected
    to equal the number of pregnancy-file records listed for that caseid.

    resp: respondent DataFrame

    returns: True when every row agrees; False as soon as one row
             disagrees, after printing (caseid, record count, pregnum)
    """
    # map from caseid to list of pregnancy indices, built from the
    # pregnancy frame
    records_by_caseid = nsfg.MakePregMap(nsfg.ReadFemPreg())

    for row_label, reported in resp.pregnum.items():
        matching = records_by_caseid[resp.caseid[row_label]]
        if len(matching) == reported:
            continue

        # Only the first discrepancy is reported: a single mismatch is
        # already enough to say the two files do not cross-validate.
        print(resp.caseid[row_label], len(matching), reported)
        return False

    # pregnum data is cross-validated
    return True
def main():
    """Cross-validate the respondent file against the pregnancy file.

    Asserts, for every respondent row, that pregnum equals the number of
    pregnancy records listed for the row's caseid, then prints 'Success'.
    """
    resp = nsfg.ReadFemResp()
    records_by_caseid = nsfg.MakePregMap(nsfg.ReadFemPreg())

    for row_label, reported in resp.pregnum.items():
        records = records_by_caseid[resp.caseid[row_label]]
        assert reported == len(records)

    print('Success')
def cross_validate_with_preg(preg_df, resp_df):
    """Cross-validate pregnum against the pregnancy records.

    For every caseid in the pregnancy frame, compares the number of
    pregnancy records with the pregnum value of the first respondent row
    that has the same caseid. The first mismatch is printed as
    (caseid, record count, pregnum).

    preg_df: pregnancy DataFrame
    resp_df: respondent DataFrame

    returns: True if every caseid matches, False at the first mismatch

    raises: IndexError if a caseid has no row in resp_df
    """
    preg_map = nsfg.MakePregMap(preg_df)

    for preg_caseid, preg_idxs in preg_map.items():
        resp_pregnum = resp_df[resp_df.caseid == preg_caseid].pregnum.iloc[0]
        if resp_pregnum != len(preg_idxs):
            # preg_idxs is the list of record indices for this caseid, so
            # it cannot be indexed by the caseid itself; report both
            # counts instead.
            print(preg_caseid, len(preg_idxs), resp_pregnum)
            return False

    return True
def PairWiseDifference(live):
    """Collect pairwise differences in prglngth within each caseid.

    Only rows with prglngth >= 37 are considered, and only caseids with
    at least two such rows contribute.

    live: DataFrame with caseid and prglngth columns

    returns: list holding everything Diffs produced, in map order
    """
    kept = live[live.prglngth >= 37]
    rows_by_caseid = nsfg.MakePregMap(kept)

    diffs = []
    for rows in rows_by_caseid.values():
        lengths = kept.loc[rows].prglngth.values
        if len(lengths) < 2:
            continue
        diffs.extend(Diffs(lengths))

    return diffs
def validate(resp, preg):
    """Cross-validate pregnum against the pregnancy records.

    Compares each respondent's pregnum with the number of pregnancy
    records listed for the same caseid. On the first mismatch the
    offending respondent row is printed.

    resp: respondent DataFrame
    preg: pregnancy DataFrame

    returns: True if every respondent matches, False at the first mismatch
    """
    preg_map = nsfg.MakePregMap(preg)

    for index, pregnum in resp.pregnum.items():
        caseidresp = resp.caseid[index]
        indices = preg_map[caseidresp]
        if len(indices) != pregnum:
            # `index` is a row label; resp[index] would look for a column
            # with that name and raise KeyError, so select the row.
            print(resp.loc[index])
            return False

    return True
def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    Compares each respondent's pregnum with the number of pregnancy
    records listed for the same caseid. The first mismatch is printed as
    (caseid, record count, pregnum).

    resp: respondent DataFrame
    preg: pregnancy DataFrame

    returns: True if every respondent matches, False at the first mismatch
    """
    respToPregMap = nsfg.MakePregMap(preg)

    # Series.items() replaces iteritems(), which pandas 2.0 removed
    for index, pregnum in resp.pregnum.items():
        caseid = resp.caseid[index]
        pregCount = len(respToPregMap[caseid])
        if pregCount != pregnum:
            print(caseid, pregCount, pregnum)
            return False

    return True
def CrossValidate(resp):
    """Cross-validate pregnum against the pregnancy file.

    Compares, for each respondent row, the number of pregnancy records
    listed for the row's caseid with the row's pregnum. The first
    mismatch is reported with both counts.

    resp: respondent DataFrame

    returns: True if every respondent matches, False at the first mismatch
    """
    preg = nsfg.ReadFemPreg()
    pregMap = nsfg.MakePregMap(preg)

    # Walk caseid together with its row label so pregnum can be read from
    # the same row. The earlier form re-filtered the whole frame for each
    # caseid, called int() on the resulting Series, and then used the
    # caseid as a row label when building the failure message.
    for index, caseid in resp['caseid'].items():
        pregMapNum = len(pregMap[caseid])
        respNum = int(resp.pregnum[index])
        if pregMapNum != respNum:
            print("Test failed on caseid " + str(caseid)
                  + ", pregMapNum: " + str(pregMapNum)
                  + ", respNum: " + str(respNum))
            return False

    return True
def PairwiseDiff(live):
    """Collect prglngth differences between rows sharing a caseid.

    For every caseid with at least two rows, the prglngth of the first
    listed row is compared with each of the remaining rows.

    live: DataFrame with caseid and prglngth columns

    returns: list of (first - other) differences, in map order
    """
    rows_by_caseid = nsfg.MakePregMap(live)

    diffs = []
    for rows in rows_by_caseid.values():
        if len(rows) < 2:
            continue

        lengths = live.loc[rows].prglngth.values
        baseline = lengths[0]
        for other in lengths[1:]:
            diffs.append(baseline - other)

    return diffs
def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    Reads the pregnancy file and compares each respondent's pregnum with
    the number of pregnancy records listed for the same caseid. The first
    mismatch is printed as (caseid, record count, pregnum).

    resp: respondent DataFrame

    returns: True if every respondent matches, False at the first mismatch
    """
    preg = nsfg.ReadFemPreg()
    preg_map = nsfg.MakePregMap(preg)

    # Series.items() replaces iteritems(), which pandas 2.0 removed
    for index, pregnum in resp.pregnum.items():
        caseid = resp.caseid[index]
        indices = preg_map[caseid]
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
def ValidatePregnum(dctpreg, dctresp, pregfile, respfile):
    """Validate preg and resp in terms of pregnum.

    Loads both files with ReadFemFile and asserts, for every caseid in
    the pregnancy data, that the number of pregnancy records equals the
    pregnum of the first respondent row with that caseid.

    dctpreg, pregfile: arguments passed to ReadFemFile for the pregnancy data
    dctresp, respfile: arguments passed to ReadFemFile for the respondent data

    raises: AssertionError on the first caseid whose counts differ
    """
    preg = ReadFemFile(dctpreg, pregfile)
    resp = ReadFemFile(dctresp, respfile)

    # caseid -> list of pregnancy indices
    records_by_caseid = nsfg.MakePregMap(preg)

    for caseid, records in records_by_caseid.items():
        count_in_preg = len(records)
        matching_rows = resp[resp.caseid == caseid]
        count_in_resp = list(matching_rows.pregnum)[0]
        assert count_in_resp == count_in_preg
def main(script):
    """Code used while developing Chapter 1.

    Prints one caseid followed by the outcome values of its pregnancy
    records.

    script: string script name (not used here)
    """
    preg = nsfg.ReadFemPreg()
    records_by_caseid = nsfg.MakePregMap(preg)

    # show the sequence of outcomes recorded for a single caseid
    sample_caseid = 10229
    rows = records_by_caseid[sample_caseid]
    outcomes = preg.outcome[rows].values
    print(sample_caseid, outcomes)
def main(script):
    """Tests the functions in this module.

    For every caseid in the pregnancy file, asserts that the number of
    pregnancy records equals the pregnum of the respondent with that
    caseid, then prints a success line.

    script: string script name

    raises: AssertionError if a caseid does not match exactly one
            respondent row, or if the two counts differ
    """
    respdata = read_fem_resp()
    pregdata = nsfg.ReadFemPreg()
    preg_index_dict = nsfg.MakePregMap(pregdata)

    for caseid, indexes in preg_index_dict.items():
        reported = respdata.pregnum[respdata.caseid == caseid].values

        # Compare scalars. Asserting on `int == ndarray` depends on the
        # truth value of an array, which is only defined when exactly one
        # respondent row matches.
        assert len(reported) == 1, (
            'caseid %s matches %d respondent rows' % (caseid, len(reported)))
        assert len(indexes) == reported[0], (
            'caseid %s: %d pregnancy records, pregnum %s'
            % (caseid, len(indexes), reported[0]))

    print('%s: All tests passed.' % script)
def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    Compares each respondent's pregnum with the number of pregnancy
    records listed for the same caseid. The first mismatch is printed as
    (caseid, record count, pregnum).

    resp: respondent DataFrame
    preg: pregnancy DataFrame

    returns: True if every respondent matches, False at the first mismatch
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg)

    # iterate over the pregnum series as (row label, value) pairs
    # (Series.items() replaces iteritems(), which pandas 2.0 removed)
    for k, v in resp.pregnum.items():
        caseid = resp.caseid[k]      # caseid stored on the same row
        indices = preg_map[caseid]   # pregnancy rows for that caseid

        # check if the number of entries is equal to
        # the resp.pregnum value
        if len(indices) != v:
            print(caseid, len(indices), v)
            return False

    return True
def validate(resp):
    """Run consistency assertions on a frame with caseid and pregnum.

    resp: DataFrame with caseid and pregnum columns

    raises: AssertionError when any check fails
    """
    # NOTE(review): 13593 is asserted as the row count of `resp`; confirm
    # this is the intended frame and figure.
    assert len(resp.index) == 13593

    # value_counts(1) passes normalize=True and yields a whole Series of
    # proportions, so asserting on its comparison raises ValueError.
    # The check wants the number of rows whose pregnum equals 1.
    assert resp.pregnum.value_counts().loc[1] == 1267

    preg_map = nsfg.MakePregMap(resp)
    preg_times_map = preg_times(preg_map)

    for caseid, indicies in preg_map.items():
        # the number of rows selected for this caseid must equal the
        # figure preg_times derived for it
        pregnum_resp = resp.loc[indicies, 'pregnum']
        assert len(pregnum_resp) == preg_times_map[caseid]

        # NOTE(review): this compares caseid values with the row labels
        # stored in preg_map; it holds only if the two coincide. Confirm
        # the intended check.
        caseid_resp = resp.caseid[indicies]
        assert caseid_resp.tolist() == preg_map[caseid]
def ValidatePregnum(femResp):
    """Validate pregnum in the respondent file.

    femResp: respondent DataFrame

    returns: True when every row's pregnum equals the number of pregnancy
             records for its caseid; False after printing the first
             (caseid, record count, pregnum) that disagrees
    """
    # read the pregnancy frame and group its rows by caseid
    records_by_caseid = nsfg.MakePregMap(nsfg.ReadFemPreg())

    caseids = femResp.caseid
    for row_label, reported in femResp.pregnum.items():
        caseid = caseids[row_label]
        found = len(records_by_caseid[caseid])
        if found != reported:
            print(caseid, found, reported)
            return False

    return True
def validatePregnum(resp):
    """Validate pregnum in the respondent file.

    Reads the 2002 pregnancy file from the data/ directory and compares
    it with the respondent frame row by row.

    resp: respondent DataFrame

    returns: True when all rows agree; False after printing the first
             (caseid, record count, pregnum) that disagrees
    """
    preg = nsfg.ReadFemPreg(
        dct_file='data/2002FemPreg.dct',
        dat_file='data/2002FemPreg.dat.gz',
    )
    records_by_caseid = nsfg.MakePregMap(preg)

    for row_label, reported in resp.pregnum.items():
        caseid = resp.caseid[row_label]
        records = records_by_caseid[caseid]

        # pregnum must equal the number of pregnancy records
        if len(records) == reported:
            continue

        print(caseid, len(records), reported)
        return False

    return True
def main(script):
    """Tests the functions in this module.

    Prints the pregnum value counts (values below 7 one per line, larger
    values combined, plus a total) for comparison with the NSFG codebook,
    then cross-validates the respondent file against the pregnancy file
    and prints every mismatch found.

    script: string script name
    """
    respdf = nsfg.ReadFemResp()
    pregnum = respdf['pregnum']

    # The variable pregnum is a recode that indicates how many times each
    # respondent has been pregnant. Print the value counts for this
    # variable and compare them to the published results in the NSFG
    # codebook:
    # https://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=FEM&section=R&subSec=7869&srtLabel=606835
    preg_stat = pregnum.value_counts().sort_index()
    list_of_Npregs = pregnum.unique()
    list_of_Npregs.sort()
    print("list_of_Npregs", list_of_Npregs)

    preg_stat_nsfg = []
    Npregs_7_95 = 0
    Npregs_tot = 0
    for i in list_of_Npregs:
        count = preg_stat[i]
        Npregs_tot += count
        if i < 7:
            preg_stat_nsfg.append((i, count))
        elif i > 6:
            Npregs_7_95 += count

    print("pregnums:")
    for value, count in preg_stat_nsfg:
        print(value, " ", count)
    print("7-95 ", Npregs_7_95, "\nTotal = ", Npregs_tot)

    # Cross-validate the respondent and pregnancy files by comparing
    # pregnum for each respondent with the number of records in the
    # pregnancy file.
    pregdf = nsfg.ReadFemPreg()
    map_ResptoPreg = nsfg.MakePregMap(pregdf)

    fail = 0
    # Series.items() replaces iteritems(), which pandas 2.0 removed
    for index, pregnum in respdf.pregnum.items():
        caseid = respdf.caseid[index]
        indices = map_ResptoPreg[caseid]
        if pregnum != len(indices):
            print("caseid in resp:", caseid, ", pregnum=", pregnum,
                  " entries in preg= ", indices)
            fail += 1

    if fail == 0:
        print('%s: All tests passed.' % script)
def CrossValPythonically(resp, preg):
    """Use the dict returned by MakePregMap to validate.

    Prints the number of caseids whose pregnum (taken from the first
    matching respondent row) differs from their pregnancy record count.

    resp: dataframe with nsfg respondents
    preg: dataframe with nsfg pregnancies
    """
    records_by_caseid = nsfg.MakePregMap(preg)

    mismatched = [
        caseid
        for caseid, records in records_by_caseid.items()
        if resp.loc[resp.caseid == caseid, 'pregnum'].values[0] != len(records)
    ]

    print(len(mismatched))
def main(script):
    """Tests the functions in this module.

    Prints one line per entry of the map returned by preg_times, runs
    validate on the respondent frame, and prints a success line.

    script: string script name
    """
    preg = nsfg.ReadFemPreg()
    times_by_subject = preg_times(nsfg.MakePregMap(preg))

    for subject, times in times_by_subject.items():
        print("subject %s was pregnant %s times" % (subject, times))

    validate(ReadFemResp())

    print('%s: All tests passed.' % script)
def PairWiseDifferences(live):
    """Print and plot pairwise prglngth differences within each caseid.

    Only rows with prglngth >= 37 are used, and only caseids with at
    least two such rows contribute. The mean of the collected
    differences is printed and their PMF is shown as a histogram.

    live: DataFrame with caseid and prglngth columns
    """
    kept = live[live.prglngth >= 37]

    diffs = []
    for rows in nsfg.MakePregMap(kept).values():
        lengths = kept.loc[rows].prglngth.values
        if len(lengths) < 2:
            continue
        diffs.extend(Diffs(lengths))

    print('Mean difference between pairs', thinkstats2.Mean(diffs))

    thinkplot.Hist(thinkstats2.Pmf(diffs), align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
def main(script):
    """Tests the functions in this module.

    Prints the sorted pregnum value counts, then compares pregnum with
    the number of pregnancy records for the first rows of the respondent
    frame, printing [caseid, pregnum, record count] for each mismatch.

    script: string script name (not used here)
    """
    print('Running')
    resp = ReadFemResp()
    print(resp.pregnum.value_counts().sort_index())

    preg = nsfg.ReadFemPreg()
    pm = nsfg.MakePregMap(preg)

    print('checking counts')
    # NOTE(review): head() restricts the check to the first few
    # respondents; confirm that this sampling is intentional.
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for i, r in resp.caseid.head().items():
        if resp.pregnum[i] != len(pm[r]):
            print([r, resp[resp.caseid == r].pregnum.iloc[0], len(pm[r])])
def ValidateData(resp):
    """Cross-validate pregnum against the pregnancy file.

    resp: respondent DataFrame

    returns: True when each row's pregnum equals the number of pregnancy
             records for its caseid; False after printing the first
             (caseid, record count, pregnum) that disagrees
    """
    preg = nsfg.ReadFemPreg()
    rows_by_caseid = nsfg.MakePregMap(preg)

    # walk the respondent rows and compare the two sources
    for row_label, reported in resp.pregnum.items():
        caseid = resp.caseid[row_label]
        record_count = len(rows_by_caseid[caseid])

        if record_count != reported:
            print(caseid, record_count, reported)
            return False

    return True
def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    resp: respondent DataFrame

    returns: True if pregnum agrees with the pregnancy file on every row,
             False (after printing caseid, record count and pregnum) on
             the first row where it does not
    """
    # nsfg.MakePregMap gives a dictionary from each caseid to a list of
    # indices into the pregnancy DataFrame
    preg_map = nsfg.MakePregMap(nsfg.ReadFemPreg())

    pregnums = resp.pregnum
    for row_label, expected in pregnums.items():
        caseid = resp.caseid[row_label]
        actual = len(preg_map[caseid])

        # the respondent file and the pregnancy file must agree
        agrees = actual == expected
        if not agrees:
            print(caseid, actual, expected)
            return False

    return True
def ValidatePregnum(respo):
    """Validate pregnum in the respondent file.

    respo: respondent DataFrame

    returns: False, after printing (caseid, record count, pregnum), at
             the first row whose pregnum differs from the number of
             pregnancy records for its caseid; True otherwise
    """
    pregnancies = nsfg.ReadFemPreg()

    # caseid -> list of pregnancy indices
    index_lists = nsfg.MakePregMap(pregnancies)

    for label, pregnum in respo.pregnum.items():
        caseid = respo.caseid[label]
        n_records = len(index_lists[caseid])

        if n_records == pregnum:
            continue

        print(caseid, n_records, pregnum)
        return False

    return True
def main(script):
    """Tests the functions in this module.

    For every caseid in the pregnancy file, in sorted order, compares the
    number of pregnancy records with the respondent's numpregs and prints
    (caseid, record count, numpregs) for each mismatch. The success line
    is printed only when no mismatch was found.

    script: string script name
    """
    df = nsfg.ReadFemPreg()
    preg_map = nsfg.MakePregMap(df)

    dfr = ReadFemResp()
    # presumably maps caseid to the respondent's row label; verify
    # against MakeRespMap
    resp_map = MakeRespMap(dfr)

    mismatches = 0
    for caseid in sorted(preg_map):
        preg_count = len(preg_map[caseid])
        index = resp_map[caseid]
        numpregs = dfr.numpregs[index]
        if preg_count != numpregs:
            print(caseid, preg_count, numpregs)
            mismatches += 1

    # Report success after the checks have run, and only if they all
    # held. Printing it first claimed success before anything was tested.
    if mismatches == 0:
        print('%s: All tests passed.' % script)
def validatePregnum(resp):
    """Cross-validate pregnum by the number of records in the preg file.

    Compares each respondent's pregnum with the number of pregnancy
    records listed for the same caseid. The first mismatch is printed as
    (caseid, record count, pregnum).

    resp: respondent DataFrame; when None, the respondent file is read
          with nsfg.ReadFemResp()

    returns: True if every respondent matches, False at the first mismatch
    """
    # Honor the frame supplied by the caller. Re-reading the file
    # unconditionally made the argument meaningless.
    if resp is None:
        resp = nsfg.ReadFemResp()

    # caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(nsfg.ReadFemPreg())

    # iterate through the respondent pregnum series
    for index, pregnum in resp.pregnum.items():
        caseid = resp.caseid[index]
        indices = preg_map[caseid]

        # check that pregnum from respondent file equals
        # number of records in preg file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
def PairWiseDifferences(live):
    """Summarize pairwise differences for children of the same mother.

    Restricts the data to rows with prglngth >= 37, gathers the values
    Diffs returns for every caseid that has at least two such rows,
    prints their mean and shows their PMF as a histogram.

    live: DataFrame of pregnancy records for live births
    """
    selected = live[live.prglngth >= 37]
    index_lists = nsfg.MakePregMap(selected).values()

    # prglngth values grouped by caseid, produced lazily in map order
    grouped_lengths = (
        selected.loc[index_list].prglngth.values
        for index_list in index_lists
    )

    diffs = []
    for lengths in grouped_lengths:
        if len(lengths) >= 2:
            diffs.extend(Diffs(lengths))

    mean = thinkstats2.Mean(diffs)
    print('Mean difference between pairs', mean)

    pmf = thinkstats2.Pmf(diffs)
    thinkplot.Hist(pmf, align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')