def parse_format_assignments(txt):
    """Map tokens of SAS-style ``format`` statements to their format name.

    ``txt`` is split on ``;``; only statements whose stripped, lower-cased
    text starts with ``format`` are considered.  Each such statement is
    lower-cased and split on ``.``; within every resulting chunk the last
    whitespace-separated token is taken as the format name and *every* token
    of the chunk is mapped to it.

    Note that this means the ``format`` keyword itself and each format name
    also appear as keys (a format name maps to itself).

    Args:
        txt: text containing ``;``-terminated statements.

    Returns:
        dict of lower-cased token -> lower-cased format name.
    """
    assignments = {}
    for statement in txt.split(';'):
        if not statement.strip().lower().startswith('format'):
            continue
        # every '.' closes one "<var> <var> ... <fmt>" group
        for group in statement.lower().split('.'):
            tokens = group.split()
            if not tokens:
                # e.g. the empty remainder after the final '.'
                continue
            fmt_name = tokens[-1]
            for token in tokens:
                assignments[token] = fmt_name
    return assignments
def parse_questions(txt):
    """Extract ``variable -> label text`` pairs from ``label`` statements.

    ``txt`` is split on ``;``; only statements whose stripped, lower-cased
    text starts with ``label`` are parsed.  Each statement is lower-cased
    (so the returned label text is lower case as well) and processed line
    by line as ``<variable> = <label>``.

    Compared with a naive ``split('=')``:

    * lines without ``=`` (the bare ``label`` keyword line, blank lines)
      are skipped instead of raising ``IndexError``;
    * only the first ``=`` separates variable and label, so label text that
      itself contains ``=`` is kept whole;
    * when the keyword shares a line with the first assignment
      (``label q1 = ...``) the variable is the last token before ``=``.

    Args:
        txt: text containing ``;``-terminated statements.

    Returns:
        dict of lower-cased variable name -> label text with all single and
        double quote characters removed.
    """
    rqt = re.compile(r'[\"\']')  # match quote chars
    questions = {}
    for statement in txt.split(';'):
        if not statement.strip().lower().startswith('label'):
            continue
        for line in statement.lower().split('\n'):
            lhs, sep, rhs = line.partition('=')
            if not sep:
                # keyword-only or blank line: nothing to assign
                continue
            lhs_tokens = lhs.split()
            if not lhs_tokens:
                # "= something" has no variable name to key on
                continue
            # last token drops a leading "label" keyword on the same line
            questions[lhs_tokens[-1]] = rqt.sub('', rhs.strip())
    return questions
def varlabels2df(vlbls, yr=None):
    """Flatten a ``{var: {code: label}}`` mapping into an indexed DataFrame.

    One row is produced per (variable, code) pair with a single ``label``
    column.  The previous implementation mapped a two-argument lambda over
    ``dict.items()``, which receives one tuple per item and therefore raised
    ``TypeError``; the pairs are unpacked explicitly here.

    Args:
        vlbls: mapping of variable name -> mapping of code -> label.
        yr: optional year.  When truthy a constant ``year`` column is added
            and becomes part of the index.

    Returns:
        DataFrame indexed by ``['var', 'year', 'code']`` when ``yr`` is
        truthy, otherwise by ``['var', 'code']``.  An empty ``vlbls`` yields
        an empty frame with the same index names and a ``label`` column.
    """
    index_cols = ['var', 'year', 'code'] if yr else ['var', 'code']
    frames = [
        pd.DataFrame({'code': list(codes.keys()),
                      'label': list(codes.values()),
                      'var': var})
        for var, codes in vlbls.items()
    ]
    if not frames:
        # pd.concat refuses an empty list; return the same shape with no rows
        columns = ['code', 'label', 'var'] + (['year'] if yr else [])
        return pd.DataFrame(columns=columns).set_index(index_cols)
    df = pd.concat(frames)
    if yr:
        df = df.assign(year=yr)
    return df.set_index(index_cols)
def parse_variable_labels(txt, repl, lbls_to_lower=True):
    """Parse ``value`` statements into ``{format name: {code: label}}``.

    ``txt`` is split on ``;``; statements whose stripped, lower-cased text
    starts with ``value`` are kept.  The second whitespace-separated token
    of a statement's first line (lower-cased) is the format name, and the
    remaining lines are handed to ``block2dict`` to build the code -> label
    mapping.

    Args:
        txt: text containing ``;``-terminated statements.
        repl: label replacement mapping, forwarded to ``block2dict``.
        lbls_to_lower: forwarded to ``block2dict`` as ``to_lower``.

    Returns:
        dict of format name -> dict produced by ``block2dict``.
    """
    labels = {}
    for statement in txt.split(';'):
        if not statement.strip().lower().startswith('value'):
            continue
        header, *body = statement.strip().split('\n')
        # header looks like "value <name> ..."; token 1 is the format name
        fmt_name = header.split()[1].lower()
        labels[fmt_name] = block2dict(body, repl=repl, to_lower=lbls_to_lower)
    logger.info('parsed varlabels from format txt',
                nlabeled=len(labels.keys()),
                nrepl=len(repl.keys()))
    return labels
def get_metadata_socrata_denovo(soc_cfg):
    """Download question metadata from SODA and derive question/facet tables.

    For every endpoint in ``soc_cfg.soda_api`` a ``$select``/``$order`` query
    is built, loaded through ``dl.df_from_socrata_url`` and the resulting
    frames are concatenated.

    Args:
        soc_cfg: configuration object; the attributes ``mapcols``,
            ``qn_meta``, ``computed`` and ``soda_api`` are read.

    Returns:
        Tuple ``(qns, facs)``:
        ``qns`` holds the distinct ``qid``/``year``/``topic``/``subtopic``/
        ``question``/``response`` rows; ``facs`` stacks the distinct
        ``facet``/``facet_level`` rows with one pseudo-facet row per distinct
        ``year`` and per distinct ``sitecode``.
    """
    cfg = soc_cfg
    # inverse of mapcols: look up the key that maps to a given value
    inverse_cols = {mapped: source for source, mapped in cfg.mapcols.items()}
    query = '{api_url}?$select={cols}&$order={ocols}'
    # metadata columns that are not computed locally
    wanted = list(set(cfg.qn_meta).difference(cfg.computed))
    qncols = ','.join(inverse_cols.get(col, col) for col in wanted)
    ocols = ','.join([inverse_cols['qid'], 'year'])
    logger.info('loading SODA meta data')
    frames = [
        dl.df_from_socrata_url(
            query.format(api_url=api_url, cols=qncols, ocols=ocols))
        for api_url in cfg.soda_api
    ]
    res = pd.concat(frames, ignore_index=True)
    # Disabled post-processing steps, kept for reference (they used to be
    # further stages applied to the concatenated frame):
    #   lambda xf: xf.applymap(lambda x: (re.sub('\xa0', '', x)).strip()),
    #   lambda xf: xf.rename(index=str,
    #                        columns={x: x.lower() for x in xf.columns}),
    #   lambda xf: xf if not g.mapcols else xf.rename(index=str,
    #                                                 columns=g.mapcols),
    #   curry(apply_fn2vals)(fns=g.apply_fn),
    #   lambda xf: xf if not g.mapvals else xf.replace(g.mapvals),
    #   lambda xf: xf if not g.mapvals else xf.applymap(
    #       lambda x: g.mapvals[x.lower().strip()]
    #       if x.lower().strip() in g.mapvals else x),
    #   lambda xf: xf[g.qn_meta])
    # NOTE(review): with those steps disabled the column selections below
    # rely on the downloaded frame already using these names -- confirm.
    logger.info('finished transformations', res=res.head())

    # pull out question -> response breakouts
    question_cols = ['qid', 'year', 'topic', 'subtopic', 'question',
                     'response']
    qns = res[question_cols].drop_duplicates().reset_index(drop=True)

    # year and sitecode are exposed as facets too: one row per distinct value
    year_levels = res[['year']].drop_duplicates()
    year_levels = year_levels.assign(facet='year')
    year_levels = year_levels.rename(index=str,
                                     columns={'year': 'facet_level'})

    site_levels = res[['sitecode']].drop_duplicates()
    site_levels = site_levels.assign(facet='sitecode')
    site_levels = site_levels.rename(index=str,
                                     columns={'sitecode': 'facet_level'})

    declared_facets = res[['facet', 'facet_level']].drop_duplicates()
    facs = pd.concat([declared_facets, year_levels, site_levels], axis=0)
    facs = facs.reset_index(drop=True)
    logger.info('created qn and facs', qn=qns.head(), fac=facs.head())
    return (qns, facs)
def get_qids_by_year(soc_cfg):
    """Write per-year counts of each question id to stdout as CSV.

    For every endpoint in ``soc_cfg.soda_api`` a query grouping by year and
    question-id column is loaded via ``dl.df_from_socrata_url``; the frames
    are concatenated and dumped with ``DataFrame.to_csv(sys.stdout)``.

    Args:
        soc_cfg: configuration object; ``mapcols`` and ``soda_api`` are read.

    Returns:
        None.
    """
    cfg = soc_cfg
    # inverse of mapcols: look up the key that maps to 'qid'
    inverse_cols = {mapped: source for source, mapped in cfg.mapcols.items()}
    query = ('{api_url}?'
             '$select=year,{qnkey},count(year)'
             '&$group=year,{qnkey}'
             '&$order={qnkey},year')
    qnkey = inverse_cols['qid']
    frames = [
        dl.df_from_socrata_url(query.format(api_url=api_url, qnkey=qnkey))
        for api_url in cfg.soda_api
    ]
    counts = pd.concat(frames, ignore_index=True)
    counts.to_csv(sys.stdout)
def load_variable_labels(format_f, formas_f, repl, year=None):
    """Load value labels and format assignments and join them per variable.

    Both inputs are fetched with ``dl.fetch_data_from_url``, read fully and,
    when the payload is bytes, decoded as UTF-8 ignoring undecodable bytes.
    The format text is parsed with ``parse_variable_labels`` and the
    assignment text with ``parse_format_assignments``.

    Args:
        format_f: location of the text containing the ``value`` blocks.
        formas_f: location of the text containing the ``format`` statements.
        repl: label replacement mapping forwarded to
            ``parse_variable_labels``.
        year: accepted for interface compatibility; not used here.

    Returns:
        dict of assignment key -> ``{code: label}`` for every assignment
        whose format name has parsed labels; other assignments are dropped.
    """
    def _fetch_text(location):
        # fetch + read + decode, shared by both inputs
        payload = dl.fetch_data_from_url(location).read()
        if isinstance(payload, bytes):
            return payload.decode('utf-8', errors='ignore')
        return payload

    logger.info("loading format labels", file=format_f)
    labels = parse_variable_labels(_fetch_text(format_f), repl=repl)
    logger.info("loaded format labels", lbls=labels)

    logger.info("loading format assignments", file=formas_f)
    assignments = parse_format_assignments(_fetch_text(formas_f))
    logger.info("loaded format assns", ass=assignments)

    return {k: labels[v] for k, v in assignments.items() if v in labels}
def block2dict(lines, repl, to_lower=False):
    """Turn ``<codes> = <label>`` lines into an ``{int code: label}`` dict.

    Per line: ``\\x92`` is replaced by ``'``, the line is stripped and all
    quote characters are removed, then it is split at the first ``=``.

    * Lines without ``=`` are skipped (previously an ``IndexError``).
    * Everything after the first ``=`` is the label, so a label containing
      ``=`` is no longer cut short.
    * Whitespace is removed from the code part; a code part containing ``-``
      is skipped (ranges are not supported).
    * The code part is split on ``,``; only all-decimal-digit codes are kept
      and converted with ``int``.  ``isdecimal`` is used because ``int``
      rejects some strings that ``isnumeric`` accepts (e.g. ``'²'``).
    * The label is stripped, filtered to word characters, whitespace and a
      few symbols, optionally lower-cased, and finally replaced by
      ``repl[label]`` when present in ``repl``.

    Args:
        lines: iterable of text lines.
        repl: mapping of label -> replacement label.
        to_lower: lower-case labels before the ``repl`` lookup.

    Returns:
        dict of int code -> label; later lines win on duplicate codes.
    """
    rqt = re.compile(r'[\"\']')  # match quote chars
    rws = re.compile(r'\s')  # match whitespace
    # keep only alnum and a few unreserved symbols
    ruri = re.compile(r'(?![\w\s\-\_\.\'\$\-\+\(\)\/]|\.).')

    mapping = {}
    for line in lines:
        cleaned = rqt.sub('', line.replace('\x92', "'").strip())
        raw_codes, sep, raw_label = cleaned.partition('=')
        if not sep:
            # blank or otherwise non-assignment line
            continue
        codes = rws.sub('', raw_codes.strip())
        if '-' in codes:
            # no support for ranges
            continue
        label = ruri.sub('', raw_label.strip())
        if to_lower:
            label = label.lower()
        label = repl[label] if label in repl else label
        for code in codes.split(','):
            # remove non-numeric codes; cat codes are ints
            if code.isdecimal():
                mapping[int(code)] = label
    return mapping
def test_thread_last():
    """Check ``thread_last`` on ``(func, arg)`` steps and bare callables."""
    numbers = [1, 2, 3]

    # increment every element, then keep the even results
    evens = thread_last(numbers, (map, inc), (filter, iseven))
    assert list(evens) == [2, 4]

    # same pipeline, keeping the odd results instead
    odds = thread_last(numbers, (map, inc), (filter, isodd))
    assert list(odds) == [3]

    # scalar input: the tuple step is followed by a plain callable
    assert thread_last(2, (add, 5), double) == 14
import us import pandas as pd import numpy as np from cytoolz.itertoolz import unique from cytoolz.functoolz import thread_last, identity from cytoolz.curried import map, filter, curry from survey_stats import pdutil # import sys # import traceback as tb from survey_stats import log logger = log.getLogger(__name__) US_STATES_FIPS_INTS = thread_last(us.STATES_AND_TERRITORIES, map(lambda x: x.fips), filter(lambda x: x is not None), map(lambda x: int(x)), list) SITECODE_TRANSLATORS = { 'fips': lambda x: (us.states.lookup('%.2d' % x).abbr if int(x) in US_STATES_FIPS_INTS else 'NA'), 'codes': identity } SVYDESIGN_COLS = ['sitecode', 'strata', 'psu', 'weight'] def convert_cat_codes(s, fmt): unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))