Example #1
File: api.py  Project: eXcuvator/pycps
import json
import logging
from pathlib import Path

import pandas as pd

# Assumed import: the parser helpers referenced below as ``par`` live in
# this project's parsers module.
from pycps import parsers as par

logger = logging.getLogger(__name__)
_HERE_ = Path(__file__).parent


def parse():
    settings_file = str(_HERE_ / 'settings.json')
    settings = par.read_settings(settings_file)

    dd_path = Path(settings['dd_path'])
    dds = [x for x in dd_path.iterdir() if x.suffix in ('.ddf', '.asc')]
    monthly_path = Path(settings['monthly_path'])
    months = [x for x in monthly_path.iterdir() if x.suffix in ('.Z', '.zip')]

    settings['raise_warnings'] = False

    logger.info("Reading Data file")
    with (_HERE_ / 'data.json').open() as f:
        data = json.load(f)

    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']
    dds = dds + [_HERE_ / Path('cpsm2014-01.ddf')]

    for dd in dds:
        parser = par.DDParser(dd, settings)
        df = parser.run()
        parser.write(df)
        logging.info("Added {} to {}".format(dd, parser.store_path))

    for month in months:
        dd_name = par._month_to_dd(str(month))
        store_path = settings['monthly_store']

        dd = pd.read_hdf(settings['dd_store'], key=dd_name)
        cols = data['columns_by_dd'][dd_name]
        sub_dd = dd[dd.id.isin(cols)]

        if len(cols) != len(sub_dd):
            missing = set(cols) - set(sub_dd.id.values)
            raise ValueError("IDs {} are not in the Data "
                             "Dictionary".format(missing))

        with pd.HDFStore(store_path) as store:  # pd.get_store() was removed from pandas
            try:
                cached_cols = store.select(month.stem).columns
                newcols = set(cols) - set(cached_cols) - set(id_cols)
                if len(newcols) == 0:
                    logger.info("Using cached {}".format(month.stem))
                    continue

            except KeyError:
                pass

        # Assuming no new rows
        df = par.read_monthly(str(month), sub_dd)
        df = par.fixup_by_dd(df, dd_name)
        # do special stuff like compute HRHHID2, bin things, etc.
        # TODO: special stuff

        df = df.set_index(id_cols)
        par.write_monthly(df, store_path, month.stem)
        logging.info("Added {} to {}".format(month, settings['monthly_store']))
Example #2
    def test_read_monthly(self):
        # ``p`` is the parsers module under test; ``self.infile`` and
        # ``self.dd`` are fixtures (a fixed-width monthly file and its
        # parsed data dictionary).
        result = p.read_monthly(self.infile, self.dd)
        # expected = pd.DataFrame([['000000000000000', 11, 1999],
        #                          ['000000000000001', 12, 1999],
        #                          ['000000000000002', 1, 2000],
        #                          ['000000000000003', 2, 2000]],
        #                         columns=['HRHHID', 'HRMONTH', 'HRYEAR4'])
        expected = pd.DataFrame(
            [[0, 11, 1999], [1, 12, 1999], [2, 1, 2000], [3, 2, 2000]], columns=["HRHHID", "HRMONTH", "HRYEAR4"]
        )

        tm.assert_frame_equal(result, expected)
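
The fixtures self.infile and self.dd are not shown. As a rough sketch of what read_monthly consumes, assuming the data dictionary is a DataFrame of (id, length, start, end) rows describing fixed-width fields (that column layout is an assumption, not taken from this test), the inputs could be built like so:

import pandas as pd

# Assumed dictionary layout: one row per fixed-width field.
dd = pd.DataFrame([['HRHHID', 15, 1, 15],
                   ['HRMONTH', 2, 16, 17],
                   ['HRYEAR4', 4, 18, 21]],
                  columns=['id', 'length', 'start', 'end'])

# Four 21-character records matching the expected frame above; note that
# read_monthly parses HRHHID as an integer, which is why the zero-padded
# strings in the commented-out block collapse to 0..3.
with open('tiny.month', 'w') as f:
    f.write('000000000000000111999\n'
            '000000000000001121999\n'
            '000000000000002012000\n'
            '000000000000003022000\n')

result = p.read_monthly('tiny.month', dd)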
Example #3
# Imports as in Example #1 (json, logging, Path, pandas as pd, and the
# project's parsers module as ``par``).
def parse(kind, settings, overwrite=False):
    """
    Parse downloaded files, store in HDFStore.

    Parameters
    ----------

    kind : {'dictionary', 'data'}
    settings : dict
    overwrite : bool
    """
    with open(settings['info_path']) as f:
        info = json.load(f)

    s_path = {'dictionary': 'dd_path', 'data': 'monthly_path'}[kind]
    path_ = Path(settings[s_path])

    if not path_.exists():
        path_.mkdir(parents=True)

    suffix_d = {'data': ('.Z', '.zip'), 'dictionary': ('.ddf', '.asc', '.txt')}
    suffixes = suffix_d[kind]

    files = [x for x in path_.iterdir() if x.suffix in suffixes]
    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']

    # ``settings['info']`` looks like a typo for ``'info_path'``, which was
    # already loaded above; reuse it rather than reading the file twice
    # (assuming the info file also carries 'columns_by_dd').
    data = info

    if kind == 'dictionary':
        files.append(_HERE_ / Path('cpsm2014-01.ddf'))
        for f in files:
            parser = par.DDParser(f, settings, info)
            df = parser.run()
            parser.write(df)
            logging.info("Added {} to {}".format(f, parser.store_path))
    else:
        for f in files:
            dd_name = par._month_to_dd(str(f))
            store_path = settings['monthly_store']

            dd = pd.read_hdf(settings['dd_store'], key=dd_name)
            cols = data['columns_by_dd'][dd_name]
            sub_dd = dd[dd.id.isin(cols)]

            if len(cols) != len(sub_dd):
                missing = set(cols) - set(sub_dd.id.values)
                raise ValueError("IDs {} are not in the Data "
                                 "Dictionary".format(missing))

            with pd.HDFStore(store_path) as store:  # pd.get_store() was removed from pandas
                try:
                    cached_cols = store.select(f.stem).columns
                    newcols = set(cols) - set(cached_cols) - set(id_cols)
                    if len(newcols) == 0:
                        logger.info("Using cached {}".format(f.stem))
                        continue

                except KeyError:
                    pass

            # Assuming no new rows
            df = par.read_monthly(str(f), sub_dd)

            fixups = settings['FIXUP_BY_DD'].get(dd_name)
            logger.info("Applying {} to {}".format(fixups, f.stem))
            df = par.fixup_by_dd(df, fixups)
            # TODO: special stuff

            df = df.set_index(id_cols)
            par.write_monthly(df, store_path, f.stem)
            logging.info("Added {} to {}".format(f, settings['monthly_store']))