def winsorize(df: pd.DataFrame, by: Union[str, Iterable[str]],
              p: Iterable[Union[tuple, float]] = (.01, .99)):
    """Drop rows whose values in the `by` columns fall outside quantiles `p`.

    Despite the name, out-of-range observations are removed (trimmed), not
    clipped to the cutoff values.

    Args:
        df: Data to filter. The input frame itself is not modified.
        by: A column name, or an iterable of column names, whose values are
            compared against the quantile cutoffs.
        p: Either one ``(lower, upper)`` pair of quantiles that is applied to
            every column in `by`, or a sequence of such pairs with exactly
            one pair per column in `by`.

    Returns:
        pd.DataFrame: A new frame holding only the rows for which every
        column in `by` lies within its cutoffs (inclusive on both ends).

    Raises:
        ValueError: If per-column cutoffs are supplied but their count does
            not match the number of columns in `by`.
    """
    # TODO: Some kind of warning/error if too fine of quantiles are
    # requested for the number of rows, e.g. .99 with 5 rows.
    by = cast(Iterable[str], force_iterable(by))
    n_cols = len(by)

    # Allow different cutoffs for different variables
    if hasattr(p[0], '__iter__'):
        # Explicit raise rather than `assert`, which is stripped under -O.
        if len(p) != n_cols:
            raise ValueError(
                "Got {} quantile pairs for {} columns; pass a single pair "
                "or exactly one pair per column.".format(len(p), n_cols)
            )
        cutoffs = p
    else:
        cutoffs = [p] * n_cols

    # A row survives only if it is inside the cutoffs for every column.
    survive_winsor = np.ones(df.shape[0], dtype=bool)
    for idx, col in enumerate(by):
        cuts = df[col].quantile(cutoffs[idx]).values
        survive_this = np.logical_and(df[col] >= cuts[0], df[col] <= cuts[1])
        survive_winsor = np.logical_and(survive_winsor, survive_this)

    # Boolean indexing already yields a new frame, so no up-front copy of
    # `df` is needed.
    return df[survive_winsor]
def outreg(regs, var_names=None, var_labels=None, digits=4, stars=True, se="(",
           options=False):
    """Create the guts of a LaTeX tabular environment from regression results.

    Args:
        regs (Results or iterable of Results): Regressions to output to table.
        var_names (str or iterable of str): Variable names to pull from regs.
            If none specified, uses the union of the coefficient names
            (the ``beta`` index) across ``regs``, in first-seen order.
        var_labels (str or iterable of str): Pretty names for variables in
            table. If none specified, will use ``var_names`` (converted to
            ``str``).

    Keyword Args:
        digits (int): Defaults to 4. How many digits to include past decimal.
        stars (bool): Defaults to True. If True, adds stars to mark
            statistical significance.
        se (str): Defaults to "(". Marker for standard errors. May also
            choose brackets with ``se="["``.
        options (bool): Defaults to False. If True, also return a ``dict``
            with formatting options that were generated by ``outreg``:
            ``name_just``, ``stat_just``, etc., for additional calls to
            ``table_mainrow`` and ``table_statrow``.

    Return:
        str: LaTeX fragment meant to be wrapped in a tabular environment.
        If ``options`` is True, a tuple ``(fragment, opt_dict)`` instead.
    """
    regs = force_iterable(regs)

    if var_names is None:
        var_names = regs[0].beta.index.tolist()
        if len(regs) > 1:
            for reg in regs[1:]:
                var_names += [x for x in reg.beta.index.tolist()
                              if x not in var_names]
    else:
        # A bare string must be treated as one name, not iterated by char.
        var_names = force_iterable(var_names)

    if var_labels is None:
        var_labels = [str(x) for x in var_names]
    else:
        var_labels = force_iterable(var_labels)

    opt_dict = _set_options(var_labels, digits, stars)

    # `se` is not part of what `_set_options` receives, so forward it
    # explicitly; otherwise the caller's choice of marker is silently
    # ignored. Use a separate dict so the returned `opt_dict` is unchanged.
    row_opts = {**opt_dict, 'se': se}

    table_str = ''.join(
        table_mainrow(var_labels[var_idx], varname, regs, **row_opts)
        for var_idx, varname in enumerate(var_names)
    )

    if options:
        return table_str, opt_dict
    return table_str
def table_mainrow(rowname, varname, regs, name_just=24, stat_just=12, digits=3,
                  se="(", stars=True):
    """Add a table row of regression coefficients with standard errors.

    Two rows are produced and concatenated: one with the coefficients (plus
    optional significance stars) and one, with an empty row name, holding the
    standard errors.

    Args:
        rowname (str): First cell of table row, i.e., the row's name.
        varname (str): Name of variable to pull from ``Results`` object.
        regs (Results or iterable of Results): Regressions from which to pull
            coefficients named ``varname``. Entries that are not ``Results``
            instances, or that have no coefficient called ``varname``, yield
            blank cells.

    Keyword Args:
        name_just (int): Forwarded to ``table_statrow`` (presumably the
            width used to justify the row-name cell).
        stat_just (int): Forwarded to ``table_statrow`` (presumably the
            width used to justify each statistic cell).
        digits (int): Digits used when formatting the coefficients and
            standard errors.
        se (str): Standard error marker, forwarded to ``table_statrow`` as
            its ``sd`` argument.
        stars (bool): If True, append the output of ``_sig_level`` for the
            coefficient's ``pt`` value to the coefficient.

    Returns:
        str: String of table row.
    """
    beta_vals = []
    se_vals = []

    # Extract beta/sig and se values to pass to `table_statrow`
    for reg in force_iterable(regs):
        # isinstance (not an exact type match) so subclasses of Results are
        # reported rather than silently blanked out.
        if not isinstance(reg, Results) or varname not in reg.beta:
            beta_vals.append('')
            se_vals.append('')
            continue

        # Beta and stars
        this_beta = _format_nums(reg.beta[varname], digits=digits)
        this_sig = _sig_level(reg.pt[varname]) if stars else ''
        beta_vals.append(this_beta + this_sig)

        # Standard error is passed raw; `table_statrow` formats it.
        se_vals.append(reg.se[varname])

    beta_row = table_statrow(rowname, beta_vals, name_just=name_just,
                             stat_just=stat_just)
    se_row = table_statrow('', se_vals, name_just=name_just,
                           stat_just=stat_just, sd=se, digits=digits)

    return beta_row + se_row
def outreg(regs, var_names=None, var_labels=None, digits=4, stars=True, se="(",
           options=False):
    """Create the guts of a LaTeX tabular environment from regression results.

    Args:
        regs (Results or iterable of Results): Regressions to output to table.
        var_names (str or iterable of str): Variable names to pull from regs.
            If none specified, uses the union of the coefficient names
            (the ``beta`` index) across ``regs``, in first-seen order.
        var_labels (str or iterable of str): Pretty names for variables in
            table. If none specified, will use ``var_names`` (converted to
            ``str``).

    Keyword Args:
        digits (int): Defaults to 4. How many digits to include past decimal.
        stars (bool): Defaults to True. If True, adds stars to mark
            statistical significance.
        se (str): Defaults to "(". Marker for standard errors. May also
            choose brackets with ``se="["``.
        options (bool): Defaults to False. If True, also return a ``dict``
            with formatting options that were generated by ``outreg``:
            ``name_just``, ``stat_just``, etc., for additional calls to
            ``table_mainrow`` and ``table_statrow``.

    Return:
        str: LaTeX fragment meant to be wrapped in a tabular environment.
        If ``options`` is True, a tuple ``(fragment, opt_dict)`` instead.
    """
    regs = force_iterable(regs)

    if var_names is None:
        # Collect coefficient names across all regressions, keeping the
        # order in which they are first encountered.
        var_names = regs[0].beta.index.tolist()
        if len(regs) > 1:
            for reg in regs[1:]:
                var_names += [x for x in reg.beta.index.tolist()
                              if x not in var_names]
    else:
        # The docstring allows a single str; wrap it so it is not iterated
        # character by character.
        var_names = force_iterable(var_names)

    if var_labels is None:
        var_labels = [str(x) for x in var_names]
    else:
        var_labels = force_iterable(var_labels)

    opt_dict = _set_options(var_labels, digits, stars)

    # Forward the standard error marker explicitly: `_set_options` is never
    # given `se`, so without this the argument has no effect. The returned
    # `opt_dict` is deliberately left untouched.
    row_opts = {**opt_dict, 'se': se}

    rows = []
    for var_idx, varname in enumerate(var_names):
        rows.append(
            table_mainrow(var_labels[var_idx], varname, regs, **row_opts)
        )
    table_str = ''.join(rows)

    if options:
        return table_str, opt_dict
    return table_str
def test_string(self):
    """A string is wrapped whole in a one-element tuple."""
    text = 'abcd'
    wrapped = force_iterable(text)
    assert (text,) == wrapped
def test_array(self):
    """A numpy array comes back with the same contents."""
    arr = np.arange(3)
    assert_array_equal(arr, force_iterable(arr))
def test_tup(self):
    """A tuple compares equal to what `force_iterable` returns for it."""
    tup = (1, 2, 3)
    returned = force_iterable(tup)
    assert tup == returned
def test_int(self):
    """A scalar int is wrapped in a one-element tuple."""
    number = 10
    wrapped = force_iterable(number)
    assert (number,) == wrapped
def test_list(self):
    """A list compares equal to what `force_iterable` returns for it."""
    items = [1, 2, 3]
    returned = force_iterable(items)
    assert items == returned