Exemplo n.º 1
0
def ValidatePregnum(resp):
    """Validate pregnum in the respondent file.

    For every caseid that appears in the pregnancy file, checks that the
    respondent's `pregnum` equals the number of pregnancy records for that
    caseid.  Respondents with no records in the pregnancy file are not
    visited by this loop.

    resp: respondent DataFrame with `caseid` and `pregnum` columns

    returns: True if every visited caseid matches; False at the first
             problem, after printing caseid, record count and pregnum
             (None when the caseid has no respondent row)
    """
    # read the pregnancy frame
    preg = nsfg.ReadFemPreg()

    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg)

    # iterate through the preg_map
    for caseid, indices in preg_map.items():
        matches = resp.pregnum[resp.caseid == caseid].values

        # a caseid with pregnancy records but no respondent row is itself
        # a validation failure; report it instead of raising IndexError
        if len(matches) == 0:
            print(caseid, len(indices), None)
            return False

        pregnum = matches[0]

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
Exemplo n.º 2
0
def ValidatePregnum(resp):
    """Check the respondent file's pregnum column against the pregnancy file.

    resp: respondent DataFrame

    returns: True when every respondent's pregnum equals the number of
             pregnancy records mapped to that respondent's caseid;
             otherwise prints the first mismatch (caseid, record count,
             pregnum) and returns False
    """
    # caseid -> list of pregnancy indices, built from the pregnancy frame
    pregnancies = nsfg.ReadFemPreg()
    records_by_case = nsfg.MakePregMap(pregnancies)

    # walk the respondents one at a time
    for label, reported in resp.pregnum.items():
        case = resp.caseid[label]
        n_records = len(records_by_case[case])

        # nothing to report when the two counts agree
        if n_records == reported:
            continue

        print(case, n_records, reported)
        return False

    return True
Exemplo n.º 3
0
def main(script):
    """Run this module's self-checks and report success.

    Raises AssertionError as soon as one of the checks fails.

    script: string script name
    """
    # respondent file: load it and check the row count
    resp = ReadFemResp()
    n_resp = len(resp)
    assert n_resp == 7643

    # pregnancy file: load it, show its shape, check the row count
    preg = nsfg.ReadFemPreg()
    print(preg.shape)
    n_preg = len(preg)
    assert n_preg == 13593

    # the pregnum column in `resp` must agree with the number
    # of entries in `preg`
    pregnum_ok = ValidatePregnum(resp, preg)
    assert pregnum_ok

    PrintPregNums(preg)

    print('%s: All tests passed.' % script)
Exemplo n.º 4
0
def main():
    """Generate the survival plots for the pregnancy and marriage data.

    Plots the pregnancy data and the Cycle 6 marriage data, resamples the
    Cycle 6 survival curve, plots remaining lifetime, then saves two
    by-decade figures ('survival4' and 'survival5') built from Cycles 5-7.
    """

    def save_unmarried_figure(root):
        # both saved figures share the same labels, limits and formats
        thinkplot.Save(root=root,
                       xlabel='age (years)',
                       ylabel='prob unmarried',
                       xlim=[13, 45],
                       ylim=[0, 1],
                       formats=FORMATS)

    thinkstats2.RandomSeed(17)

    pregnancies = nsfg.ReadFemPreg()
    sf1 = PlotPregnancyData(pregnancies)

    # Cycle 6 drives the first set of plots
    resp6 = ReadFemResp2002()
    sf2 = PlotMarriageData(resp6)
    ResampleSurvival(resp6)
    PlotRemainingLifetime(sf1, sf2)

    # Cycles 5 and 7 are read here; list order is cycle 5, 6, 7
    resps = [ReadFemResp1995(), resp6, ReadFemResp2010()]

    # resampled survival functions by decade
    PlotResampledByDecade(resps)
    save_unmarried_figure('survival4')

    # the same plot again, this time with predictions
    PlotResampledByDecade(resps, predict_flag=True, omit=[5])
    save_unmarried_figure('survival5')
Exemplo n.º 5
0
def myNSFG():
    """Explore the NSFG pregnancy frame.

    Prints the frame summary, rescales `agepreg`, replaces the codes
    97/98/99 in the birth-weight columns with NaN, derives `totalwgt_lb`,
    and prints the `outcome` values of every record for caseid 10229.
    """
    df = nsfg.ReadFemPreg()
    # print(df.columns)
    # DataFrame.info() prints its own report and returns None, so wrapping
    # it in print() only adds a stray "None" line.
    df.info()

    # NOTE(review): the stock ThinkStats2 nsfg.ReadFemPreg already applies
    # this cleaning; if that is the version in use, agepreg is divided by
    # 100 twice here -- confirm against the local nsfg module.
    df["agepreg"] /= 100
    na_vals = [97, 98, 99]
    # assign the result back: an in-place replace on a selected column is a
    # chained update and is not guaranteed to reach the parent frame
    df["birthwgt_lb"] = df["birthwgt_lb"].replace(na_vals, np.nan)
    df["birthwgt_oz"] = df["birthwgt_oz"].replace(na_vals, np.nan)
    df["totalwgt_lb"] = df["birthwgt_lb"] + df["birthwgt_oz"] / 16.0

    # print(pregordr.describe())
    # print(df["birthwgt_lb"].describe())
    # print(df["birthwgt_lb"].isnull())
    # print(df["birthwgt_lb"].value_counts(sort=False))
    # print(df["outcome"].value_counts().sort_index())
    # print(df["birthwgt_lb"].value_counts().sort_index())

    # group row labels by caseid; Series.items() replaces iteritems(),
    # which was removed in pandas 2.0
    d = defaultdict(list)
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    caseid = 10229
    indices = d[caseid]
    print(df.outcome[indices].values)
Exemplo n.º 6
0
    """Use the dict returned by MakePregMap to validate

    resp: dataframe with nsfg respondents
    preg: dataframe with nsfg pregnancies
    """
    dict_preg = nsfg.MakePregMap(preg)
    validatecases = []
    for key, value in dict_preg.items():
        preg_val = resp.loc[resp.caseid == key,
                            'pregnum'].values[0] - len(value)
        if preg_val != 0:
            validatecases.append(key)

    print(len(validatecases))


if __name__ == '__main__':
    main(*sys.argv)

    # load the respondent frame first, then the pregnancy frame
    resp, preg = nsfg.ReadFemResp(), nsfg.ReadFemPreg()

    # part one (disabled): distribution of pregnum
    # print(resp.pregnum.value_counts().sort_index())

    # part two (disabled): first cross-validation attempt
    # CrossValidatePregnum(resp, preg)

    # part three: cross-validation driven by the pregnancy dict
    CrossValPythonically(resp, preg)
Exemplo n.º 7
0
# Author: Matt Xiao
# Data frame from ThinkStats

import nsfg
import thinkstats2
import thinkplot

# Load the NSFG pregnancy frame (not used by the plotting below).
df = nsfg.ReadFemPreg()

# Build a histogram from a small hand-written sample and display it.
sample = [1, 2, 2, 3, 5]
hist = thinkstats2.Hist(sample)

thinkplot.Hist(hist)
axis_labels = dict(xlabel='value', ylabel='frequency')
thinkplot.Show(**axis_labels)



Exemplo n.º 8
0
def main(script):

    preg = nsfg.ReadFemPreg()  # DataFrame
    resp = nsfg.ReadFemResp()  # DataFrame
    preg_by_caseid = MakePregMap(preg)  # dictionary

    total_pregnancies_by_caseid = {}
    for key, preg_list in preg_by_caseid.items():
        total_preg = 0
        for p in preg_list:
            total_preg += 1
        total_pregnancies_by_caseid[key] = total_preg

    print(len(total_pregnancies_by_caseid))
    print(len(resp))
    # for k, v in total_pregnancies_by_caseid.items():
    #     print(k,v)

    # # iterate through the respondent pregnum series
    # for index, pregnum in resp.pregnum.iteritems():
    #     caseid = resp.caseid[index]
    #     indices = total_pregnancies_by_caseid[caseid]


    #     # check that pregnum from the respondent file equals
    #     # the number of records in the pregnancy file
    #     if indices != pregnum:
    #         print(caseid, indices, pregnum)


    # preg_pregnum = pd.DataFrame([total_pregnancies_by_caseid], columns=['caseid', 'pregnum'])
    # result = ValidatePregnum(resp, preg_pregnum)
    # print(result)

    # df = ReadFemResp()
    # print(df.pregnum.head())
    # print(df.pregnum.value_counts().sort_index())

    # bins = [0,1,2,3,4,5,6,100]
    # print(pd.cut(df.pregnum, bins).value_counts().sort_index())

    # preg = nsfg.ReadFemPreg()
    # resp = nsfg.ReadFemResp()
    # print(ValidatePregnum(resp, preg))
    # print(preg.head())

    # pregnum_map = nsfg.MakePregMap(preg)
    # # pprint(pregnum_map)


    # print(len(pregnum_map))
    # print(len(resp))

    # for key, value in pregnum_map.items():
    #     pass
        # print(key, len(value))
        # print(type(resp.pregnum[key]))
        # if resp.pregnum[key] == len(value):
        #     print("MATCH")
        # elif resp.pregnum[key] != len(value):
        #     print("NO MATCH")
        # else:
        #     print("ERROR")
    # print(pregnum_map)



    # print(resp.pregnum)

    # caseid = 12556
    # pregnum_map = nsfg.MakePregMap(preg)
    # indices = pregnum_map[caseid]
    # # resp.pregnum[indices].values
    # result = preg.pregnum
    # print(result)
    # print(resp.head())


    # print(result)

    """Tests the functions in this module.
Exemplo n.º 9
0
def MakeDataFrames():
    """Read the pregnancy frame and split it by outcome and birth order.

    returns: tuple (live, first, other) where `live` holds the rows with
             outcome == 1, `first` the rows of `live` with birthord == 1,
             and `other` the rows of `live` with birthord != 1
    """
    preg = nsfg.ReadFemPreg()

    is_live = preg.outcome == 1
    live = preg[is_live]

    # both masks are taken over `live`, not over the full frame
    is_first = live.birthord == 1
    not_first = live.birthord != 1

    return live, live[is_first], live[not_first]
Exemplo n.º 10
0
def main():
    """Print the frame from ReadData, then the shape of the pregnancy frame."""
    data = ReadData()
    print(data)

    pregnancies = nsfg.ReadFemPreg()
    print(pregnancies.shape)
Exemplo n.º 11
0
# coding: utf-8

# In[1]:

from __future__ import print_function, division

import nsfg

# In[2]:

# Load the pregnancy frame once and reuse it in every cell below.
pres = nsfg.ReadFemPreg()

# In[3]:

# Bare expression: shown as cell output in a notebook, no effect in a script.
pres

# In[4]:

# First 20 rows (displayed only in a notebook).
pres.head(20)

# In[5]:

# Last 30 rows (displayed only in a notebook).
pres.tail(30)

# In[6]:

print("There are", len(pres.index), "rows and", len(pres.columns), "columns.")

# In[7]:
Exemplo n.º 12
0
#import pandas as pd
import random
import scipy.stats

import sys
directory = '/Users/warren/Data_Science/metis/github/prework/dsp/ThinkStats2/code/'
sys.path.append(directory)

import nsfg
import thinkstats2
import thinkplot

# Q1
# Open question from the author: is there a concise, readable way to do these
# manipulations without materialising the intermediate "live" table below?
dct_path = directory + '2002FemPreg.dct'
dat_path = directory + '2002FemPreg.dat.gz'
preg = nsfg.ReadFemPreg(dct_file=dct_path, dat_file=dat_path)

# rows with outcome == 1
live = preg[preg.outcome == 1]

# total weight in pounds, split by birthord == 1 versus everything else
firsts = live.loc[live.birthord == 1, 'totalwgt_lb']
others = live.loc[live.birthord != 1, 'totalwgt_lb']


# This was defined in the book, but it's not in nsfg, so I copied and pasted the
# code here. I can understand its contents.
def CohenEffectSize(group1, group2):
    """Compute Cohen's d: the difference in means in pooled standard deviations.

    group1: Series (or any object providing mean(), var() and len())
    group2: Series (or any object providing mean(), var() and len())

    returns: float, (mean(group1) - mean(group2)) / sqrt(pooled variance),
             where the pooled variance weights each group's variance by the
             group's size
    """
    import math

    diff = group1.mean() - group2.mean()

    var1 = group1.var()
    var2 = group2.var()
    n1, n2 = len(group1), len(group2)

    # size-weighted average of the two variances
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    return diff / math.sqrt(pooled_var)
Exemplo n.º 13
0
def MakeDataframes():
    """Load the pregnancy frame and derive three views of it.

    returns: tuple (live, first, other): rows with outcome == 1, then the
             subset of those with birthord == 1, then the subset with
             birthord != 1
    """
    pregnancies = nsfg.ReadFemPreg()
    live = pregnancies[pregnancies.outcome == 1]

    birth_order = live.birthord
    first = live[birth_order == 1]
    other = live[birth_order != 1]
    return live, first, other
Exemplo n.º 14
0
def ValueCounts(resp):
    """Tabulate how often each pregnum value occurs among the respondents.

    resp: respondent DataFrame with a `pregnum` column

    returns: Series mapping each pregnum value to its number of
             occurrences, ordered by pregnum value
    """
    return resp.pregnum.value_counts().sort_index()
Exemplo n.º 15
0
def validate():
    """Load the respondent and pregnancy frames for validation.

    Sums the value counts of the respondents' pregnum column and reads the
    pregnancy frame.  Neither result is returned or printed by the code
    shown here.
    """
    respondents = ReadFemResp()
    pregnum_counts = respondents.pregnum.value_counts()
    pregs = sum(pregnum_counts)

    pregnancies = nsfg.ReadFemPreg()
Exemplo n.º 16
0
def main(script):
    """Print the pregnum value counts of the pregnancy frame.

    Also builds the map returned by nsfg.MakePregMap; the code shown here
    does not use it further.

    script: string script name (not referenced in the body shown here)
    """
    pregnancies = nsfg.ReadFemPreg()

    pregnum_counts = pregnancies.pregnum.value_counts()
    print(pregnum_counts.sort_index())

    preg_map = nsfg.MakePregMap(pregnancies)