def winsorize(df, by, p=(.01, .99)):
    """Drop rows whose values in the `by` columns fall outside quantiles `p`.

    Despite the name, out-of-range observations are removed (trimmed), not
    clipped to the cutoff values.

    Parameters
    ----------
    df : DataFrame
        Input data. It is only read, never modified.
    by : column label or iterable of column labels
        Column(s) to filter on; normalised with `force_iterable`.
    p : pair of quantiles, or sequence of such pairs
        Either a single ``(lower, upper)`` pair applied to every column in
        `by`, or one pair per column, in the same order as `by`.

    Returns
    -------
    DataFrame
        The rows of `df` whose value lies within ``[lower, upper]``
        (both ends inclusive) for every column in `by`.

    Raises
    ------
    AssertionError
        If per-column cutoffs are given and their count differs from the
        number of columns in `by`.
    """
    # TODO: Some kind of warning/error if too fine of quantiles are
    # requested for the number of rows, e.g. .99 with 5 rows.
    by = force_iterable(by)

    # Allow different cutoffs for different variables
    if hasattr(p[0], '__iter__'):
        # Explicit raise rather than `assert`, so the check survives
        # `python -O`; the exception type callers see stays the same.
        if len(p) != len(by):
            raise AssertionError(
                "got {} quantile pairs for {} columns".format(len(p), len(by)))
        cutoffs = p
    else:
        cutoffs = [p] * len(by)

    # Explicit bool dtype: np.array([True] * 0) would be a float array,
    # which is not a valid row mask for an empty frame.
    keep = np.ones(df.shape[0], dtype=bool)

    # Each column's quantiles are computed on the full, unfiltered input;
    # the per-column masks are combined and applied once after the loop.
    for col, bounds in zip(by, cutoffs):
        values = df[col]
        cuts = values.quantile(bounds).values
        in_range = (values >= cuts[0]) & (values <= cuts[1])
        keep = np.logical_and(keep, in_range)

    # Boolean-mask selection builds a new frame, so an up-front copy of the
    # whole input is not needed to protect the caller's data.
    return df[keep]
def test_string(self):
    # A string should be wrapped whole in a 1-tuple, not split into characters.
    text = 'abcd'
    wrapped = force_iterable(text)
    assert_equal((text, ), wrapped)
def test_tup(self):
    # A tuple passed in should come back equal to itself.
    original = (1, 2, 3)
    returned = force_iterable(original)
    assert_equal(original, returned)
def test_array(self):
    # An array passed in should come back element-wise equal to the input.
    arr = np.arange(3)
    returned = force_iterable(arr)
    assert_array_equal(arr, returned)
def test_int(self):
    # A scalar int should be wrapped in a 1-tuple.
    value = 10
    wrapped = force_iterable(value)
    assert_equal((value, ), wrapped)
def test_list(self):
    # A list passed in should come back equal to itself.
    items = [1, 2, 3]
    returned = force_iterable(items)
    assert_equal(items, returned)
def test_tup(self):
    # A tuple passed in should come back equal to itself.
    original = (1, 2, 3)
    assert original == force_iterable(original)
def test_int(self):
    # A scalar int should be wrapped in a 1-tuple.
    value = 10
    wrapped = force_iterable(value)
    assert (value, ) == wrapped
def test_list(self):
    # A list passed in should come back equal to itself.
    items = [1, 2, 3]
    assert items == force_iterable(items)