def download(overwrite_cached=False):
    """Download CPS data dictionaries and monthly data files.

    Reads paths and the date range from ``settings.json``, filters the
    remote file listing down to that range, and fetches anything missing
    from the local cache directories.

    Parameters
    ----------
    overwrite_cached : bool, default False
        When False, files already present in the cache are skipped;
        when True everything in the configured range is re-downloaded.
    """
    settings = par.read_settings(str(_HERE_ / 'settings.json'))
    cached_dd = dl.check_cached(settings['dd_path'], kind='dictionary')
    cached_month = dl.check_cached(settings['monthly_path'], kind='data')

    dds = dl.all_monthly_files(kind='dictionary')
    dds = filter(itemgetter(1), dds)  # make sure not None cpsdec!
    dds = dl.filter_dds(dds, months=[par._month_to_dd(settings['date_start']),
                                     par._month_to_dd(settings['date_end'])])

    data = dl.all_monthly_files()
    data = dl.filter_monthly_files(data, months=[[settings['date_start'],
                                                  settings['date_end']]])

    if not overwrite_cached:
        def is_new(x, cache=None):
            # BUG FIX: the original dereferenced `cache` unconditionally, so a
            # call without the partial-bound cache raised TypeError on `in`.
            if cache is None:
                cache = ()
            return dl.rename_cps_monthly(x[1]) not in cache

        dds = filter(partial(is_new, cache=cached_dd), dds)
        data = filter(partial(is_new, cache=cached_month), data)

    for month, renamed in dds:
        dl.download_month(month, Path(settings['dd_path']))
        logging.info("Downloaded {}".format(renamed))

    for month, renamed in data:
        dl.download_month(month, Path(settings['monthly_path']))
        logging.info("Downloaded {}".format(renamed))
def parse():
    """Parse cached data dictionaries and monthly data files into HDF5 stores.

    Pass 1 parses every data-dictionary file (``.ddf`` / ``.asc``) found in
    ``dd_path`` and writes the result to the dictionary store.  Pass 2 reads
    each compressed monthly file, selects the columns listed for its
    dictionary in ``data.json``, and appends the frame (indexed by the CPS
    household/person id columns) to the monthly store, skipping months whose
    cached columns are already complete.

    Raises
    ------
    ValueError
        If ``data.json`` requests a column id absent from the month's
        data dictionary.
    """
    settings_file = str(_HERE_ / 'settings.json')
    settings = par.read_settings(settings_file)

    dd_path = Path(settings['dd_path'])
    dds = [x for x in dd_path.iterdir() if x.suffix in ('.ddf', '.asc')]
    monthly_path = Path(settings['monthly_path'])
    months = [x for x in monthly_path.iterdir() if x.suffix in ('.Z', '.zip')]

    settings['raise_warnings'] = False
    logger.info("Reading Data file")
    with (_HERE_ / 'data.json').open() as f:
        data = json.load(f)

    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']
    # The 2014-01 dictionary ships alongside this module; always include it.
    dds = dds + [_HERE_ / Path('cpsm2014-01.ddf')]

    for dd in dds:
        parser = par.DDParser(dd, settings)
        df = parser.run()
        parser.write(df)
        # CONSISTENCY FIX: use the module-level `logger` (as the rest of this
        # function does) instead of the root `logging` module.
        logger.info("Added {} to {}".format(dd, parser.store_path))

    for month in months:
        dd_name = par._month_to_dd(str(month))
        store_path = settings['monthly_store']

        dd = pd.read_hdf(settings['dd_store'], key=dd_name)
        cols = data['columns_by_dd'][dd_name]
        sub_dd = dd[dd.id.isin(cols)]

        if len(cols) != len(sub_dd):
            missing = set(cols) - set(sub_dd.id.values)
            raise ValueError("IDs {} are not in the Data "
                             "Dictionary".format(missing))

        # NOTE(review): pd.get_store was deprecated and later removed from
        # pandas; on modern pandas this should be pd.HDFStore(store_path).
        with pd.get_store(store_path) as store:
            try:
                cached_cols = store.select(month.stem).columns
                newcols = set(cols) - set(cached_cols) - set(id_cols)
                if len(newcols) == 0:
                    logger.info("Using cached {}".format(month.stem))
                    continue
            except KeyError:
                # Month not in the store yet; fall through and parse it.
                pass

        # Assuming no new rows
        df = par.read_monthly(str(month), sub_dd)
        df = par.fixup_by_dd(df, dd_name)
        # do special stuff like compute HRHHID2, bin things, etc.
        # TODO: special stuff
        df = df.set_index(id_cols)
        par.write_monthly(df, store_path, month.stem)
        logger.info("Added {} to {}".format(month, settings['monthly_store']))
def main(config):
    """Drive the CPS pipeline (download / parse / merge) from CLI *config*.

    Parameters
    ----------
    config : argparse.Namespace-like
        Expected attributes: ``settings``, ``overwrite``, ``info``,
        ``monthly_data_fixups``, ``append_fixups``, the four
        download/parse flags, and ``merge``.
    """
    settings = par.read_settings(config.settings)
    overwrite = config.overwrite

    # Overwrite default info file?
    if config.info is not None:
        settings['info'] = config.info

    if config.monthly_data_fixups:
        import importlib
        fixup_file = config.monthly_data_fixups
        # BUG FIX: str.strip('.py') strips *characters* from both ends, not
        # the suffix (e.g. 'pyfix.py'.strip('.py') -> 'fix'), which corrupts
        # the module name. Remove the '.py' suffix explicitly instead.
        if fixup_file.endswith('.py'):
            fixup_file = fixup_file[:-len('.py')]
        user_fixups = importlib.import_module(fixup_file).FIXUP_BY_DD
        if config.append_fixups:
            # merge the user supplied with the defaults.
            from pycps.monthly_data_fixups import FIXUP_BY_DD
            # NOTE(review): user fixups for dictionaries absent from the
            # defaults are silently dropped here -- confirm that is intended.
            for dd in FIXUP_BY_DD:
                new = user_fixups.get(dd)
                if new is not None:
                    for x in new:
                        FIXUP_BY_DD[dd].append(x)
        else:
            # BUG FIX: `user_fixups` is already bound to the module's
            # FIXUP_BY_DD attribute above; the original's extra
            # `.FIXUP_BY_DD` access would raise AttributeError on a dict.
            FIXUP_BY_DD = user_fixups
    else:
        from pycps.monthly_data_fixups import FIXUP_BY_DD

    # Fixups will be passed and accessed via settings
    settings['FIXUP_BY_DD'] = FIXUP_BY_DD

    if config.download_dictionaries:
        download('dictionary', settings, overwrite=overwrite)
    if config.download_monthly:
        download('data', settings, overwrite=overwrite)
    if config.parse_dictionaries:
        parse('dictionary', settings, overwrite=overwrite)
    if config.parse_monthly:
        parse('data', settings, overwrite=overwrite)
    if config.merge:
        merge(settings, overwrite=overwrite)
def test_substitue(self):
    # dd_path should come back with the data-path substitution applied.
    settings = p.read_settings(self.settings_file)
    self.assertEqual(settings["dd_path"], "data/data_dictionaries/")
def test_read_setting(self):
    # A plain (non-substituted) setting should round-trip unchanged.
    settings = p.read_settings(self.settings_file)
    self.assertEqual(settings["data_path"], "data/")
def setUp(self):
    # Load the package's shipped settings file once per test.
    settings_path = mdir + "/pycps/settings.json"
    self.settings = par.read_settings(settings_path)
def setUp(self):
    # Build a DDParser over the bundled 2007-01 dictionary fixture,
    # using the package's shipped settings and info files.
    self.testfile = Path('files/cpsm2007-01.ddf')
    settings = p.read_settings(mdir + '/pycps/settings.json')
    info_path = mdir + '/pycps/info.json'
    with open(info_path) as f:
        info = json.load(f)
    self.parser = p.DDParser(self.testfile, settings, info)
def test_substitue(self):
    # dd_path should come back with the data-path substitution applied.
    settings = p.read_settings(self.settings_file)
    self.assertEqual(settings['dd_path'], 'data/data_dictionaries/')
def test_read_setting(self):
    # A plain (non-substituted) setting should round-trip unchanged.
    settings = p.read_settings(self.settings_file)
    self.assertEqual(settings['data_path'], 'data/')