Пример #1
0
def fetch_stats_totals(des, qn_f, r):
    """Build the single level-0 ("totals") statistics row for one response.

    Args:
        des: design object forwarded unchanged to the R survey routines.
        qn_f: formula string; wrapped in ``Formula`` for every R call.
        r: response label, stored as-is in the ``response`` column.

    Returns:
        The result of ``u.fill_none`` applied to a one-row DataFrame with
        the keys level, response, mean, se, ci_l, ci_u, count and
        sample_size, rounded to ``DECIMALS``.
    """
    proportion = svyciprop_xlogit(Formula(qn_f), des, multicore=False)
    logger.info('fetching stats totals', r=r, q=qn_f)

    # Unweighted counts, grouped by the value of the formula itself.
    raw_counts = rsvy.svyby(
        Formula(qn_f), Formula(qn_f), des, rsvy.unwtd_count,
        na_rm=True, na_rm_by=True, na_rm_all=True, multicore=False)
    counts = pandas2ri.ri2py(raw_counts)
    counts.columns = ['eql', 'ct', 'se_ignore']
    ct = counts.ct[counts.eql == 1].sum()
    ss = counts.ct.sum()

    # The estimate fields are only filled in when the estimate is truthy.
    if proportion:
        mean = u.guard_nan(rbase.as_numeric(proportion)[0])
        se = u.guard_nan(rsvy.SE(proportion)[0])
        ci_l = u.guard_nan(rbase.attr(proportion, 'ci')[0])
        ci_u = u.guard_nan(rbase.attr(proportion, 'ci')[1])
    else:
        mean = se = ci_l = ci_u = None

    res = {
        'level': 0,
        'response': r,
        'mean': mean,
        'se': se,
        'ci_l': ci_l,
        'ci_u': ci_u,
        'count': ct,
        'sample_size': ss,
    }
    logger.info('finished computation lvl1', res=res,
                total_ci=proportion, ct=ct, ss=ss)
    # Round the numeric columns before handing the frame back.
    rounded = pd.DataFrame([res]).round(DECIMALS)
    return u.fill_none(rounded)
Пример #2
0
 def fetch_socrata(self, qn, vars, filt=None):
     """Select the rows of ``self.soc`` for question ``qn`` as a DataFrame.

     Args:
         qn: value matched against the ``ID_COLUMN`` field.
         vars: field names, passed through ``self.mapper``. A field named
             here is left unconstrained unless ``filt`` restricts it.
         filt: optional mapping, passed through ``self.mapper``, from field
             name to the collection of accepted values. ``None`` (the
             default) means no explicit filtering.

     Returns:
         The result of ``u.fill_none`` on the selected rows, limited to the
         annotation, requested and statistics columns that exist in
         ``self.soc.fields``. Statistics columns are cast to float with
         ``-1.0`` replaced by NaN, the frame is rounded to ``DECIMALS`` and
         its index is reset.
     """
     vars = self.mapper(vars)
     # Use a fresh dict per call rather than a shared mutable default.
     filt = self.mapper({} if filt is None else filt)
     df = self.soc
     fcts = list(set(self.meta.facets).intersection(self.soc.fields))
     sel = df[ID_COLUMN] == qn
     site_col = self.mapper('sitecode')
     year_col = self.mapper('year')
     # Site: explicit filter wins; otherwise, unless the site column was
     # requested in vars, keep only rows whose site value is 'XX'.
     if site_col in filt:
         sel = sel & df[site_col].isin(filt[site_col])
     elif site_col not in vars:
         sel = sel & (df[site_col] == 'XX')
     # Year: same scheme, with 'Total' as the fallback value.
     if year_col in filt:
         sel = sel & df[year_col].isin(filt[year_col])
     elif year_col not in vars:
         sel = sel & (df[year_col] == 'Total')
     # Facets: same scheme, with 'Total' or None as the fallback values.
     for v in fcts:
         if v in filt:
             sel = sel & df[v].isin(filt[v])
         elif v not in vars:
             sel = sel & (df[v].isin(['Total', None]))
     cols = (
         set(ANNO_COLUMNS).union(vars).union(STATS_COLUMNS).intersection(
             self.soc.fields))
     cols = list(cols)
     dfz = odo(df[sel][cols], pd.DataFrame)
     stats_sub = list(set(STATS_COLUMNS).intersection(dfz.columns))
     # -1.0 in a statistics column is turned into NaN.
     dfz[stats_sub] = dfz[stats_sub].apply(
         lambda xf: xf.astype(float).replace(-1.0, np.nan))
     return u.fill_none(dfz.round(DECIMALS).reset_index(drop=True))
Пример #3
0
 def facet_map(self):
     """Map each facet name to the list of its distinct, non-null levels.

     The 'Overall' facet is dropped, and the result is merged with
     ``self.flevels_r``, whose entries win on key collisions because it is
     the later argument to ``merge``.
     """
     grouped = self.flevels.groupby(['facet']).agg({
         'facet_level':
         lambda col: col.dropna().drop_duplicates().tolist()
     })
     # One record per facet: {facet: {'facet_level': [...]}}
     by_facet = u.fill_none(grouped).to_dict(orient='index')
     levels = valmap(lambda rec: rec['facet_level'], by_facet)
     without_overall = keyfilter(lambda name: name != 'Overall', levels)
     return merge(without_overall, self.flevels_r)
Пример #4
0
def fetch_stats(des, qn, r, vs=None, filt=None):
    """Compute statistics for response ``r`` to question ``qn``.

    One totals frame is produced by ``fetch_stats_totals``, followed by one
    frame per cumulative prefix of ``vs`` (``vs[:1]``, ``vs[:2]``, ...)
    from ``fetch_stats_by``. All frames are concatenated into one.

    Args:
        des: design object; it is subset with ``filt`` before use.
        qn: question column name, interpolated into the R formula.
        r: response value, interpolated into the R formula.
        vs: optional sequence of variable names to break down by.
            ``None`` (the default) means no breakdown.
        filt: optional filter passed to ``subset_survey``. ``None`` (the
            default) is treated as an empty dict.

    Returns:
        The result of ``u.fill_none`` on the concatenated DataFrame.
    """
    # Fresh containers per call rather than shared mutable defaults.
    vs = [] if vs is None else list(vs)
    filt = {} if filt is None else filt

    # Run both the R and the Python garbage collectors before the
    # computation starts.
    rbase.gc()
    gc.collect()

    # ex: ~I(qn8=="1")
    qn_f = '~I(%s=="%s")' % (qn, r)
    logger.info('subsetting des with filter', filt=filt)
    des = subset_survey(des, filt)
    logger.info('done subsetting')

    totals = fetch_stats_totals(des, qn_f, r)
    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # cannot be added to a list.
    by_level = [fetch_stats_by(des, qn_f, r, vs[:k + 1])
                for k in range(len(vs))]
    dfz = pd.concat([totals] + by_level, ignore_index=True)
    logger.info('finished computations, appending dfs', dfs=dfz)
    return u.fill_none(dfz)
Пример #5
0
    def questions(self):
        """Aggregate ``self.qns`` into one metadata record per question.

        Rows are grouped by ``ID_COLUMN``, 'topic' and 'subtopic'. Within
        each group the 'year', 'sitecode' and 'response' columns are
        reduced with ``get_unique_aggvals``; every other column except
        'facet' and 'facet_level' is reduced to its first non-null value
        rendered as a string.

        When ``self.qns_r`` is set, the 'question' column is replaced by
        the ``self.qns_r`` lookup on ``qid``, falling back to the original
        text (kept in 'q_orig') where the lookup is null.

        Returns:
            A list of dicts, one per group (``to_dict(orient='records')``),
            after ``u.fill_none``.
        """
        def get_first_aggval(xf):
            # First non-null value as a string; None when there is none.
            # ``.values`` replaces ``Series.get_values()``, which newer
            # pandas no longer provides. Only the "no values" case is
            # caught, so other failures are not silently turned into None.
            try:
                return xf.dropna().astype(str).values[0]
            except IndexError:
                return None

        qns = self.qns
        group_cols = [ID_COLUMN, 'topic', 'subtopic']
        if self.qns_r is not None:
            # NOTE(review): qns is self.qns itself, so these assignments
            # add/overwrite columns on the shared frame -- confirm that
            # callers rely on (or at least tolerate) this side effect.
            qns['q_orig'] = qns.question.astype(str)
            qns['question'] = self.qns_r[qns.qid].reset_index(drop=True)
            qns['question'] = qns[['question', 'q_orig']].apply(
                lambda x: x.q_orig if pd.isnull(x.question) else x.question,
                axis=1)
        # columns that need to be uniqued -- otherwise assume constant for
        # each qid
        uniq = ['year', 'sitecode', 'response']
        # columns to aggregate over: everything except the groupby columns
        # and the facet columns
        dkeys = set(qns.columns).difference(group_cols +
                                            ['facet', 'facet_level'])
        # aggregation function for each column:
        # -- deduplication function for uniq cols
        # -- first value for the remaining (constant) cols
        aggd = {
            k: get_unique_aggvals if k in uniq else get_first_aggval
            for k in dkeys
        }
        grouped = qns.groupby(group_cols).agg(aggd).reset_index()
        return u.fill_none(grouped).to_dict(orient='records')
Пример #6
0
def fetch_stats_by(des, qn_f, r, vs):
    """Compute per-group statistics for the variable combination ``vs``.

    Args:
        des: design object forwarded unchanged to the R survey routines.
        qn_f: formula string for the response; its first character is
            stripped when it is appended to the counting formula.
        r: response label, written to the ``response`` column.
        vs: list of variable names; joined with '+' to form the grouping
            formula and used as the index of the intermediate frames.

    Returns:
        The result of ``u.fill_none`` on the rounded statistics frame, with
        the grouping variables restored as columns via ``reset_index``.
    """
    lvl_f = '~%s' % '+'.join(vs)
    ct_f = '%s + %s' % (lvl_f, qn_f[1:])
    logger.info('gen stats for interaction level',
                lvl_f=lvl_f, qn_f=qn_f, ct_f=ct_f, r=r)

    # Estimates (mean / se / confidence interval), indexed by group.
    stat_cols = vs + ['mean', 'se', 'ci_l', 'ci_u']
    stats = pandas2ri.ri2py(
        svybyci_xlogit(Formula(qn_f), Formula(lvl_f), des,
                       svyciprop_xlogit, vartype=['se', 'ci']))
    stats.columns = stat_cols
    stats = stats.set_index(vs)

    # Unweighted counts per group and per value of the response indicator.
    tallies = pandas2ri.ri2py(
        svyby_nodrop(Formula(lvl_f), Formula(ct_f), des,
                     rsvy.unwtd_count, keep_var=True)).fillna(0.0)
    tallies.columns = vs + ['eql', 'ct', 'se_ignore']
    tallies = tallies.set_index(vs)

    def matches_response(flag):
        # The indicator is handled both as the string 'TRUE' and as a number.
        if type(flag) is str:
            return flag == 'TRUE'
        return flag > 0

    tallies['eql'] = tallies.eql.apply(matches_response)
    # Counts are attached positionally (plain list); sample sizes by index.
    counts = tallies.ct[tallies.eql == True].tolist()
    ssizes = tallies.groupby(vs).sum()['ct']
    stats = stats.assign(count=counts, sample_size=ssizes)

    # The label columns are only added when there is at least one row.
    if len(stats) > 0:
        stats['response'] = r
        stats['level'] = len(vs)

    rdf = u.fill_none(stats.round(DECIMALS)).reset_index()
    logger.info('create svyby df', df=rdf, vars=vs, eq=tallies)
    return rdf