def fetch_stats_totals(des, qn_f, r):
    total_ci = svyciprop_xlogit(Formula(qn_f), des, multicore=False)
    # extract stats
    logger.info('fetching stats totals', r=r, q=qn_f)
    cts = rsvy.svyby(Formula(qn_f), Formula(qn_f), des, rsvy.unwtd_count,
                     na_rm=True, na_rm_by=True, na_rm_all=True,
                     multicore=False)
    cts = pandas2ri.ri2py(cts)
    cols = ['eql', 'ct', 'se_ignore']
    cts.columns = cols
    ct = cts.ct[cts.eql == 1].sum()
    ss = cts.ct.sum()
    res = {'level': 0,
           'response': r,
           'mean': u.guard_nan(
               rbase.as_numeric(total_ci)[0]) if total_ci else None,
           'se': u.guard_nan(
               rsvy.SE(total_ci)[0]) if total_ci else None,
           'ci_l': u.guard_nan(
               rbase.attr(total_ci, 'ci')[0]) if total_ci else None,
           'ci_u': u.guard_nan(
               rbase.attr(total_ci, 'ci')[1]) if total_ci else None,
           'count': ct,
           'sample_size': ss}
    # round as appropriate
    logger.info('finished computation lvl1', res=res, total_ci=total_ci,
                ct=ct, ss=ss)
    res = pd.DataFrame([res]).round(DECIMALS)
    return u.fill_none(res)
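
# `svyciprop_xlogit` is not shown in this excerpt. A minimal sketch of the
# assumed behavior -- a thin wrapper over survey::svyciprop with
# method='xlogit' that yields None when the R call fails on an empty or
# degenerate subset, which is what the `if total_ci else None` guards above
# anticipate. The RRuntimeError handling and the unused `multicore` flag are
# assumptions, not the actual implementation:
#
#     from rpy2.rinterface import RRuntimeError
#
#     def svyciprop_xlogit(formula, des, multicore=False):
#         try:
#             return rsvy.svyciprop(formula, des, method='xlogit')
#         except RRuntimeError:
#             return None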
def fetch_socrata(self, qn, vars, filt={}):
    vars = self.mapper(vars)
    filt = self.mapper(filt)
    df = self.soc
    fcts = list(set(self.meta.facets).intersection(self.soc.fields))
    sel = df[ID_COLUMN] == qn
    site_col = self.mapper('sitecode')
    year_col = self.mapper('year')
    if site_col in filt.keys():
        sel = sel & df[site_col].isin(filt[site_col])
    elif site_col not in vars:
        sel = sel & (df[site_col] == 'XX')
    if year_col in filt.keys():
        sel = sel & df[year_col].isin(filt[year_col])
    elif year_col not in vars:
        sel = sel & (df[year_col] == 'Total')
    for v in fcts:
        if v in filt.keys():
            sel = sel & df[v].isin(filt[v])
        elif v not in vars:
            sel = sel & (df[v].isin(['Total', None]))
    cols = (set(ANNO_COLUMNS)
            .union(vars)
            .union(STATS_COLUMNS)
            .intersection(self.soc.fields))
    cols = list(cols)
    dfz = odo(df[sel][cols], pd.DataFrame)
    stats_sub = list(set(STATS_COLUMNS).intersection(dfz.columns))
    # the source data encodes missing statistics as -1; convert to NaN
    dfz[stats_sub] = dfz[stats_sub].apply(
        lambda xf: xf.astype(float).replace(-1.0, np.nan))
    # logger.info('done filtering, replacing NaNs', dfz=dfz)
    return u.fill_none(dfz.round(DECIMALS).reset_index(drop=True))
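
# Example call (hypothetical question id and filter values): select the
# precomputed rows for one question, restricted to one year, faceted by sex;
# sitecode falls back to the 'XX' aggregate since it is neither filtered nor
# requested as a variable:
#
#     ds.fetch_socrata('qn8', vars=['sex'], filt={'year': ['2015']})
#
# Note the mutable default `filt={}` is only safe because `self.mapper` is
# assumed to return a fresh dict rather than mutate its argument.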
def facet_map(self):
    facs = (self.flevels
            .groupby(['facet'])
            .agg({'facet_level':
                  lambda x: x.dropna().drop_duplicates().tolist()})
            .pipe(lambda xf: u.fill_none(xf))
            .to_dict(orient='index'))
    return pipe(facs,
                curry(valmap)(lambda x: x['facet_level']),
                curry(keyfilter)(lambda x: x != 'Overall'),
                lambda x: merge(x, self.flevels_r))
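
# The result maps each facet to its observed levels, with the 'Overall'
# pseudo-facet dropped and the reverse lookups from `self.flevels_r` merged
# in. Illustrative shape only (facet names and values depend on the loaded
# dataset):
#
#     {'sex': ['Female', 'Male'],
#      'grade': ['9th', '10th', '11th', '12th'],
#      ...}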
def fetch_stats(des, qn, r, vs=[], filt={}):
    rbase.gc()
    gc.collect()
    qn_f = '~I(%s=="%s")' % (qn, r)  # ex: ~I(qn8=="1")
    logger.info('subsetting des with filter', filt=filt)
    des = subset_survey(des, filt)
    logger.info('done subsetting')
    dfs = [fetch_stats_totals(des, qn_f, r)]
    # cumulative prefixes of vs: [v1], [v1, v2], ...
    levels = [vs[:k + 1] for k in range(len(vs))]
    # get stats by facets for each level of interactions in vars,
    # using svyby to compute across combinations of loadings
    sts = [fetch_stats_by(des, qn_f, r, lvl) for lvl in levels]
    dfz = pd.concat(dfs + sts, ignore_index=True)
    logger.info('finished computations, appending dfs', dfs=dfz)
    return u.fill_none(dfz)  # .round(DECIMALS)
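
# `levels` enumerates cumulative prefixes of `vs`, so a call such as
# (hypothetical design and variables)
#
#     fetch_stats(des, 'qn8', '1', vs=['sex', 'grade'])
#
# produces the level-0 total plus level-1 stats by ['sex'] and level-2
# stats by ['sex', 'grade'].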
def questions(self):
    def get_first_aggval(xf):
        try:
            return xf.dropna().astype(str).iloc[0]
        except Exception:
            return None

    # optional metadata to add to questions
    opts = (['year'] if 'year' in list(self.qns.columns) else []) + \
        (['sitecode'] if 'sitecode' in list(self.qns.columns) else [])
    # columns to exclude before aggregating question metadata
    # consists of the groupby column, qid, plus unused optional cols
    qns = self.qns
    group_cols = [ID_COLUMN, 'topic', 'subtopic']
    if self.qns_r is not None:
        # override question text from the replacement map, falling back
        # to the original wording where no replacement exists
        qns['q_orig'] = qns.question.astype(str)
        qns['question'] = self.qns_r[qns.qid].reset_index(drop=True)
        qns['question'] = qns[['question', 'q_orig']].apply(
            lambda x: x.q_orig if pd.isnull(x.question) else x.question,
            axis=1)
        # incl_cols = incl_cols + ['question_orig']
        # group_cols = group_cols + ['question_orig']
    # columns that need to be uniqued -- otherwise assume constant for each qid
    uniq = ['year', 'sitecode', 'response']
    dkeys = set(self.qns.columns).difference(
        group_cols + ['facet', 'facet_level'])  # columns to aggregate over
    # aggregation function for each col
    # -- first value for const cols
    # -- deduplication function for uniq cols
    aggd = {
        k: get_unique_aggvals if k in uniq else get_first_aggval
        for k in dkeys
    }
    # qns['response'] = qns.response.astype(str)
    res = (qns.groupby(group_cols)
              .agg(aggd)
              .reset_index()
              .pipe(lambda xf: u.fill_none(xf))
              .to_dict(orient='records'))
    return res
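
# `get_unique_aggvals` is not shown in this excerpt. A minimal sketch,
# assuming it mirrors the facet_level aggregation in facet_map: collect the
# distinct non-null values of a grouped column as a list (the exact dtype
# handling is an assumption):
#
#     def get_unique_aggvals(xf):
#         return xf.dropna().astype(str).drop_duplicates().tolist()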
def fetch_stats_by(des, qn_f, r, vs):
    lvl_f = '~%s' % '+'.join(vs)
    ct_f = '%s + %s' % (lvl_f, qn_f[1:])
    logger.info('gen stats for interaction level', lvl_f=lvl_f, qn_f=qn_f,
                ct_f=ct_f, r=r)
    cols = vs + ['mean', 'se', 'ci_l', 'ci_u']
    df = svybyci_xlogit(Formula(qn_f), Formula(lvl_f), des,
                        svyciprop_xlogit, vartype=['se', 'ci'])
    df = pandas2ri.ri2py(df)
    df.columns = cols
    df = df.set_index(vs)
    cts = svyby_nodrop(Formula(lvl_f), Formula(ct_f), des, rsvy.unwtd_count,
                       keep_var=True)
    cts = pandas2ri.ri2py(cts).fillna(0.0)
    cts.columns = vs + ['eql', 'ct', 'se_ignore']
    cts = cts.set_index(vs)
    # R logicals may round-trip as 'TRUE'/'FALSE' strings or as numerics
    cts['eql'] = cts.eql.apply(
        lambda x: x == 'TRUE' if isinstance(x, str) else x > 0)
    counts = cts.ct[cts.eql].tolist()
    ssizes = cts.groupby(vs).sum()['ct']
    df = df.assign(count=counts, sample_size=ssizes)
    if df.shape[0] > 0:
        df['response'] = r
        df['level'] = len(vs)
    rdf = u.fill_none(df.round(DECIMALS)).reset_index()
    logger.info('create svyby df', df=rdf, vars=vs, eq=cts)
    return rdf
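
# Example (hypothetical design and question): level-2 stats for response '1'
# of qn8 across sex-by-grade cells, where `qn_f` matches the formula that
# fetch_stats builds:
#
#     fetch_stats_by(des, '~I(qn8=="1")', '1', ['sex', 'grade'])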