def download(overwrite_cached=False):
    settings = par.read_settings(str(_HERE_ / 'settings.json'))
    cached_dd = dl.check_cached(settings['dd_path'], kind='dictionary')
    cached_month = dl.check_cached(settings['monthly_path'], kind='data')

    dds = dl.all_monthly_files(kind='dictionary')
    dds = filter(itemgetter(1), dds)  # make sure not None cpsdec!
    dds = dl.filter_dds(dds, months=[par._month_to_dd(settings['date_start']),
                                     par._month_to_dd(settings['date_end'])])

    data = dl.all_monthly_files()
    data = dl.filter_monthly_files(data, months=[[settings['date_start'],
                                                  settings['date_end']]])

    if not overwrite_cached:
        def is_new(x, cache=None):
            return dl.rename_cps_monthly(x[1]) not in cache

        dds = filter(partial(is_new, cache=cached_dd), dds)
        data = filter(partial(is_new, cache=cached_month), data)

    for month, renamed in dds:
        dl.download_month(month, Path(settings['dd_path']))
        logging.info("Downloaded {}".format(renamed))

    for month, renamed in data:
        dl.download_month(month, Path(settings['monthly_path']))
        logging.info("Downloaded {}".format(renamed))

def download(kind, settings, overwrite=False):
    """
    Download files from NBER.

    Parameters
    ----------
    kind : {'dictionary', 'data'}
    settings : dict
    overwrite : bool; default False
        Whether to overwrite existing files
    """
    s_path = {'dictionary': 'dd_path', 'data': 'monthly_path'}[kind]
    cached = dl.check_cached(settings[s_path], kind=kind)

    files = dl.all_monthly_files(kind=kind)
    if kind == 'dictionary':
        files = filter(itemgetter(1), files)  # make sure not None cpsdec!
        months = [par._month_to_dd(settings['date_start']),
                  par._month_to_dd(settings['date_end'])]
    else:
        months = [[settings['date_start'], settings['date_end']]]

    files = dl.filter_monthly(files, months=months, kind=kind)

    def is_new(x, cache=None):
        return dl.rename_cps_monthly(x[1]) not in cache

    for month, renamed in files:
        if is_new((month, renamed), cache=cached) or overwrite:
            dl.download_month(month, Path(settings[s_path]))
            logger.info("Downloaded {}".format(renamed))
        else:
            logger.info("Using cached {}".format(renamed))

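# A minimal usage sketch for the refactored download(), assuming the same
# module-level imports used above (`par`, `dl`, `_HERE_`, etc.). The name
# `download_all` is hypothetical, not part of the original module.
def download_all(overwrite=False):
    # Read settings once, then run both kinds through the shared code path.
    settings = par.read_settings(str(_HERE_ / 'settings.json'))
    for kind in ('dictionary', 'data'):
        download(kind, settings, overwrite=overwrite)
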
def test_month_to_dd(self):
    months = ['1989-01', '1989-03', '1989-12', '1992-01', '1992-02',
              '1993-12', '1994-01', '1994-02', '1994-03', '1994-04',
              '1994-05', '1995-05', '1995-06', '1995-07', '1995-08',
              '1995-09', '1996-01', '1997-12', '1998-01', '2000-01',
              '2002-12', '2003-01', '2004-02', '2004-04', '2004-05',
              '2004-06', '2005-07', '2005-08', '2005-09', '2005-10',
              '2005-11', '2006-01', '2006-12', '2007-01', '2008-09',
              '2008-12', '2009-01', '2009-06', '2009-12', '2010-01',
              '2010-11', '2012-02', '2012-05', '2012-07', '2012-12',
              '2013-01', '2013-02', '2013-03']
    dds = ["cpsm1989-01", "cpsm1992-01", "cpsm1994-01", "cpsm1994-04",
           "cpsm1995-06", "cpsm1995-09", "cpsm1998-01", "cpsm2003-01",
           "cpsm2004-05", "cpsm2005-08", "cpsm2005-11", "cpsm2007-01",
           "cpsm2009-01", "cpsm2010-01", "cpsm2012-05", "cpsm2013-01"] * 3
    dds = sorted(dds)
    for month, dd in zip(months, dds):
        result = p._month_to_dd(month)
        self.assertEqual(result, dd)

def parse():
    settings_file = str(_HERE_ / 'settings.json')
    settings = par.read_settings(settings_file)

    dd_path = Path(settings['dd_path'])
    dds = [x for x in dd_path.iterdir() if x.suffix in ('.ddf', '.asc')]
    monthly_path = Path(settings['monthly_path'])
    months = [x for x in monthly_path.iterdir() if x.suffix in ('.Z', '.zip')]

    settings['raise_warnings'] = False

    logger.info("Reading Data file")
    with (_HERE_ / 'data.json').open() as f:
        data = json.load(f)

    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']

    dds = dds + [_HERE_ / Path('cpsm2014-01.ddf')]
    for dd in dds:
        parser = par.DDParser(dd, settings)
        df = parser.run()
        parser.write(df)
        logging.info("Added {} to {}".format(dd, parser.store_path))

    for month in months:
        dd_name = par._month_to_dd(str(month))
        store_path = settings['monthly_store']

        dd = pd.read_hdf(settings['dd_store'], key=dd_name)
        cols = data['columns_by_dd'][dd_name]
        sub_dd = dd[dd.id.isin(cols)]

        if len(cols) != len(sub_dd):
            missing = set(cols) - set(sub_dd.id.values)
            raise ValueError("IDs {} are not in the Data "
                             "Dictionary".format(missing))

        with pd.get_store(store_path) as store:
            try:
                cached_cols = store.select(month.stem).columns
                newcols = set(cols) - set(cached_cols) - set(id_cols)
                if len(newcols) == 0:
                    logger.info("Using cached {}".format(month.stem))
                    continue
            except KeyError:
                pass

        # Assuming no new rows
        df = par.read_monthly(str(month), sub_dd)
        df = par.fixup_by_dd(df, dd_name)
        # do special stuff like compute HRHHID2, bin things, etc.
        # TODO: special stuff
        df = df.set_index(id_cols)
        par.write_monthly(df, store_path, month.stem)
        logging.info("Added {} to {}".format(month, settings['monthly_store']))

def test_month_to_dd(self):
    months = [
        "1989-01", "1989-03", "1989-12", "1992-01", "1992-02", "1993-12",
        "1994-01", "1994-02", "1994-03", "1994-04", "1994-05", "1995-05",
        "1995-06", "1995-07", "1995-08", "1995-09", "1996-01", "1997-12",
        "1998-01", "2000-01", "2002-12", "2003-01", "2004-02", "2004-04",
        "2004-05", "2004-06", "2005-07", "2005-08", "2005-09", "2005-10",
        "2005-11", "2006-01", "2006-12", "2007-01", "2008-09", "2008-12",
        "2009-01", "2009-06", "2009-12", "2010-01", "2010-11", "2012-02",
        "2012-05", "2012-07", "2012-12", "2013-01", "2013-02", "2013-03",
    ]
    dds = [
        "cpsm1989-01", "cpsm1992-01", "cpsm1994-01", "cpsm1994-04",
        "cpsm1995-06", "cpsm1995-09", "cpsm1998-01", "cpsm2003-01",
        "cpsm2004-05", "cpsm2005-08", "cpsm2005-11", "cpsm2007-01",
        "cpsm2009-01", "cpsm2010-01", "cpsm2012-05", "cpsm2013-01",
    ] * 3
    dds = sorted(dds)

    for month, dd in zip(months, dds):
        result = p._month_to_dd(month)
        self.assertEqual(result, dd)

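# For reference, the pairing the test encodes (sorted months zipped against
# sorted dd names, three months per dd) implies that _month_to_dd resolves a
# month to the most recent data dictionary at or before it. Two spot checks
# derived from the test data above:
#
#   p._month_to_dd('1989-03')  # -> 'cpsm1989-01'
#   p._month_to_dd('2005-10')  # -> 'cpsm2005-08'
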
def parse(kind, settings, overwrite=False):
    """
    Parse downloaded files, store in HDFStore.

    Parameters
    ----------
    kind : {'dictionary', 'data'}
    settings : dict
    overwrite : bool
    """
    with open(settings['info_path']) as f:
        info = json.load(f)

    s_path = {'dictionary': 'dd_path', 'data': 'monthly_path'}[kind]
    path_ = Path(settings[s_path])
    if not path_.exists():
        path_.mkdir(parents=True)

    suffix_d = {'data': ('.Z', '.zip'),
                'dictionary': ('.ddf', '.asc', '.txt')}
    suffixes = suffix_d[kind]
    files = [x for x in path_.iterdir() if x.suffix in suffixes]

    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']

    if kind == 'dictionary':
        files.append(_HERE_ / Path('cpsm2014-01.ddf'))
        for f in files:
            parser = par.DDParser(f, settings, info)
            df = parser.run()
            parser.write(df)
            logger.info("Added {} to {}".format(f, parser.store_path))
    else:
        for f in files:
            dd_name = par._month_to_dd(str(f))
            store_path = settings['monthly_store']

            dd = pd.read_hdf(settings['dd_store'], key=dd_name)
            cols = info['columns_by_dd'][dd_name]
            sub_dd = dd[dd.id.isin(cols)]

            if len(cols) != len(sub_dd):
                missing = set(cols) - set(sub_dd.id.values)
                raise ValueError("IDs {} are not in the Data "
                                 "Dictionary".format(missing))

            with pd.get_store(store_path) as store:
                try:
                    cached_cols = store.select(f.stem).columns
                    newcols = set(cols) - set(cached_cols) - set(id_cols)
                    if len(newcols) == 0:
                        logger.info("Using cached {}".format(f.stem))
                        continue
                except KeyError:
                    pass

            # Assuming no new rows
            df = par.read_monthly(str(f), sub_dd)

            fixups = settings['FIXUP_BY_DD'].get(dd_name)
            logger.info("Applying {} to {}".format(fixups, f.stem))
            df = par.fixup_by_dd(df, fixups)
            # TODO: special stuff
            df = df.set_index(id_cols)
            par.write_monthly(df, store_path, f.stem)
            logger.info("Added {} to {}".format(f, settings['monthly_store']))

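# A minimal sketch of how the refactored pieces compose into one pipeline,
# again assuming the module-level `par` and `_HERE_` from above. `main` is a
# hypothetical driver name, not part of the original module.
def main(overwrite=False):
    settings = par.read_settings(str(_HERE_ / 'settings.json'))
    for kind in ('dictionary', 'data'):
        download(kind, settings, overwrite=overwrite)
        parse(kind, settings, overwrite=overwrite)


if __name__ == '__main__':
    main()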