def _levels_dict(demographics):
    """Map positional indices to demographic levels

    Parameters
    ----------
    demographics : pd.Series
        Demographic labels

    Returns
    -------
    levels_dict : dict
        Keys are the 0-based positions obtained by enumerating the result
        of `_levels(demographics)`; values are the levels themselves
    """
    return dict(enumerate(_levels(demographics)))
def _arrs_pos(df_orig, df_pos, demographic, pos, d_levels=None,
              print_levels=False):
    """Individual part-of-speech arrays for a particular demographic

    Parameters
    ----------
    df_orig : pd.DataFrame
        The DataFrame from which `df_pos` was created
    df_pos : pd.DataFrame
        The part-of-speech DataFrame. It is copied, so the caller's
        object is not modified.
    demographic : str
        A valid demographic-data column in `df_orig`
    pos : str
        A column in `df_pos` corresponding to a part of speech
    d_levels : list, default None
        The specific demographic levels desired (forwarded to `_levels`)
    print_levels : bool, default False
        Whether to print the demographic levels (forwarded to `_levels`)

    Returns
    -------
    arrs : tuple of np.arrays
        The corresponding `pos` values for each `demographic` level, in
        the order the levels are returned by `_levels`

    Notes
    -----
    A warning line is printed for every level that holds fewer than 10%
    of the rows in `df_pos`. Levels may be of any type (e.g. integer
    codes); they are converted to text when building the message.
    """
    df_pos = df_pos.copy()  # so we don't modify it
    # `.values` drops the index, so the labels are attached by row position;
    # this assumes `df_orig` and `df_pos` have the same number of rows in
    # the same order.
    df_pos[demographic] = df_orig[demographic].values
    levels = _levels(df_orig[demographic], d_levels, print_levels)

    # Loop-invariant: a level is flagged when it has fewer rows than this.
    min_count = 0.1 * df_pos.shape[0]

    arrs = []
    for d in levels:
        arr = df_pos.loc[df_pos[demographic] == d, pos].values
        n = arr.shape[0]
        if n < min_count:
            # `format` instead of `+` so non-string levels don't raise
            # TypeError; the output is unchanged for string levels.
            print("Warning: '{}' category has less than 10% of "
                  "observations ({})".format(d, n))
        arrs.append(arr)
    return tuple(arrs)