def test_quantile_datetime(self):
    """Quantile on a frame mixing datetime and numeric columns."""
    df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})

    # exclude datetime: the default numeric_only drops column 'a'
    result = df.quantile(.5)
    expected = Series([2.5], index=['b'], name=0.5)
    # BUG FIX: `expected` was built but never compared (and lacked the
    # quantile name), so the numeric_only path was untested.
    assert_series_equal(result, expected)

    # datetime
    result = df.quantile(.5, numeric_only=False)
    expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
                      index=['a', 'b'], name=0.5)
    assert_series_equal(result, expected)

    # datetime w/ multi
    result = df.quantile([.5], numeric_only=False)
    expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
                         index=[.5], columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # axis = 1
    df['c'] = pd.to_datetime(['2011', '2012'])
    result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
    expected = Series([Timestamp('2010-07-02 12:00:00'),
                       Timestamp('2011-07-02 12:00:00')],
                      index=[0, 1], name=0.5)
    assert_series_equal(result, expected)

    result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
    expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
                           Timestamp('2011-07-02 12:00:00')]],
                         index=[0.5], columns=[0, 1])
    assert_frame_equal(result, expected)
def test_quantile_nat(self):
    """Quantiles of frames containing NaT-only datetime columns."""
    # full NaT column
    df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})

    result = df.quantile(0.5, numeric_only=False)
    expected = Series([pd.NaT], index=['a'], name=0.5)
    tm.assert_series_equal(result, expected)

    result = df.quantile([0.5], numeric_only=False)
    expected = DataFrame({'a': [pd.NaT]}, index=[0.5])
    tm.assert_frame_equal(result, expected)

    # mixed non-null / full null column
    df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
                          pd.Timestamp('2012-01-02'),
                          pd.Timestamp('2012-01-03')],
                    'b': [pd.NaT, pd.NaT, pd.NaT]})

    result = df.quantile(0.5, numeric_only=False)
    expected = Series([pd.Timestamp('2012-01-02'), pd.NaT],
                      index=['a', 'b'], name=0.5)
    tm.assert_series_equal(result, expected)

    result = df.quantile([0.5], numeric_only=False)
    expected = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]],
                         index=[0.5], columns=['a', 'b'])
    tm.assert_frame_equal(result, expected)
def test_quantile_empty(self):
    # Quantiles of a zero-row frame, one dtype at a time, since the
    # behaviour differs per block type.

    # floats
    df = DataFrame(columns=['a', 'b'], dtype='float64')
    res = df.quantile(0.5)
    exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
    tm.assert_series_equal(res, exp)

    res = df.quantile([0.5])
    exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
    tm.assert_frame_equal(res, exp)

    # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
    # res = df.quantile(0.5, axis=1)
    # res = df.quantile([0.5], axis=1)

    # ints
    df = DataFrame(columns=['a', 'b'], dtype='int64')

    # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
    # res = df.quantile(0.5)

    # datetimes
    # NOTE(review): no assertion follows -- the datetime case is only
    # constructed, presumably awaiting a fix like the FIXMEs above.
    df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
def test_quantile_interpolation(self):
    # see gh-10174
    # Exercises the `interpolation` keyword against numpy.percentile and
    # checks that 'nearest' preserves the original dtype.
    from numpy import percentile

    # interpolation = linear (default case)
    q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
    assert q['A'] == percentile(self.tsframe['A'], 10)
    q = self.intframe.quantile(0.1)
    assert q['A'] == percentile(self.intframe['A'], 10)

    # test with and without interpolation keyword
    q1 = self.intframe.quantile(0.1)
    assert q1['A'] == np.percentile(self.intframe['A'], 10)
    tm.assert_series_equal(q, q1)

    # interpolation method other than default linear
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1, interpolation='nearest')
    expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
    tm.assert_series_equal(result, expected)

    # cross-check interpolation=nearest results in original dtype
    exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]),
                        .5, axis=0, interpolation='nearest')
    expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
    tm.assert_series_equal(result, expected)

    # float
    df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]},
                   index=[1, 2, 3])
    result = df.quantile(.5, axis=1, interpolation='nearest')
    expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
    tm.assert_series_equal(result, expected)
    exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]),
                        .5, axis=0, interpolation='nearest')
    expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
    assert_series_equal(result, expected)

    # axis
    result = df.quantile([.5, .75], axis=1, interpolation='lower')
    expected = DataFrame({1: [1., 1.], 2: [2., 2.],
                          3: [3., 3.]}, index=[0.5, 0.75])
    assert_frame_equal(result, expected)

    # test degenerate case
    df = DataFrame({'x': [], 'y': []})
    q = df.quantile(0.1, axis=0, interpolation='higher')
    assert(np.isnan(q['x']) and np.isnan(q['y']))

    # multi
    df = DataFrame([[1, 1, 1], [2, 2, 2],
                    [3, 3, 3]], columns=['a', 'b', 'c'])
    result = df.quantile([.25, .5], interpolation='midpoint')

    # https://github.com/numpy/numpy/issues/7163
    expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
                         index=[.25, .5], columns=['a', 'b', 'c'])
    assert_frame_equal(result, expected)
class Quantile(object):
    """ASV benchmark timing ``DataFrame.quantile`` along each axis."""

    # benchmark is parametrized over the quantile axis
    params = [0, 1]
    param_names = ['axis']

    def setup(self, axis):
        # a 1000x3 random frame shared by every timed call
        self.df = DataFrame(np.random.randn(1000, 3),
                            columns=list('ABC'))

    def time_frame_quantile(self, axis):
        # timed operation: two quantiles at once along the given axis
        self.df.quantile([0.1, 0.5], axis=axis)
def test_quantile_axis_mixed(self):
    # mixed dtypes on axis=1: only the numeric columns participate
    df = DataFrame({"A": [1, 2, 3],
                    "B": [2., 3., 4.],
                    "C": pd.date_range('20130101', periods=3),
                    "D": ['foo', 'bar', 'baz']})
    res = df.quantile(.5, axis=1)
    exp = Series([1.5, 2.5, 3.5], name=0.5)
    assert_series_equal(res, exp)

    # keeping non-numeric columns on axis=1 must raise
    with pytest.raises(TypeError):
        df.quantile(.5, axis=1, numeric_only=False)
def test_quantile_axis_parameter(self):
    # GH 9543/9544
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

    # axis=0 and its string alias "index" agree
    result = df.quantile(.5, axis=0)
    expected = Series([2., 3.], index=["A", "B"], name=0.5)
    assert_series_equal(result, expected)
    assert_series_equal(result, df.quantile(.5, axis="index"))

    # axis=1 and its string alias "columns" agree
    result = df.quantile(.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(result, expected)
    assert_series_equal(df.quantile(.5, axis="columns"), expected)

    # invalid axis values raise with an informative message
    msg = ("No axis named -1 for object type"
           " <class 'pandas.core.frame.DataFrame'>")
    with pytest.raises(ValueError, match=msg):
        df.quantile(0.1, axis=-1)
    msg = ("No axis named column for object type"
           " <class 'pandas.core.frame.DataFrame'>")
    with pytest.raises(ValueError, match=msg):
        df.quantile(0.1, axis="column")
def test_quantile_multi(self):
    """List-of-quantiles returns a frame indexed by the quantiles."""
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['a', 'b', 'c'])
    result = df.quantile([.25, .5])
    expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                         index=[.25, .5], columns=['a', 'b', 'c'])
    assert_frame_equal(result, expected)

    # axis = 1
    result = df.quantile([.25, .5], axis=1)
    # BUG FIX: the expected frame was a copy-paste of the axis=0 values and
    # was never asserted.  Row-wise each row is constant, so every quantile
    # of row i equals i + 1.
    expected = DataFrame([[1., 2., 3.], [1., 2., 3.]],
                         index=[.25, .5], columns=[0, 1, 2])
    assert_frame_equal(result, expected)

    # empty
    result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
    expected = DataFrame({'x': [np.nan, np.nan],
                          'y': [np.nan, np.nan]}, index=[.1, .9])
    assert_frame_equal(result, expected)
def test_empty_datelike(
    self, dtype, expected_data, expected_index, axis, expected_dtype
):
    # GH 14564
    # Quantile of an empty frame with a datetime-like dtype.  Parametrized:
    # the fixtures supply the dtype/axis and the expected NaT-filled result
    # (their values are defined outside this block).
    df = DataFrame(columns=["a", "b"], dtype=dtype)
    result = df.quantile(0.5, axis=axis, numeric_only=False)
    expected = Series(
        expected_data, name=0.5, index=Index(expected_index), dtype=expected_dtype
    )
    tm.assert_series_equal(result, expected)
def test_quantile(self, datetime_frame):
    from numpy import percentile

    df = datetime_frame
    # column-wise quantile agrees with numpy percentile
    q = df.quantile(0.1, axis=0, numeric_only=True)
    assert q["A"] == percentile(df["A"], 10)
    tm.assert_index_equal(q.index, df.columns)

    # row-wise quantile is indexed by the original rows
    q = df.quantile(0.9, axis=1, numeric_only=True)
    assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90)
    tm.assert_index_equal(q.index, df.index)

    # test degenerate case
    q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0, numeric_only=True)
    assert np.isnan(q["x"]) and np.isnan(q["y"])

    # non-numeric exclusion
    df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
    rs = df.quantile(0.5, numeric_only=True)
    # median() without numeric_only warns about the object column
    with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
        xp = df.median().rename(0.5)
    tm.assert_series_equal(rs, xp)

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(0.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    tm.assert_series_equal(result, expected)

    result = df.quantile([0.5, 0.75], axis=1)
    expected = DataFrame(
        {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
    )
    tm.assert_frame_equal(result, expected, check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3], ["a", "b", 4]])
    result = df.quantile(0.5, axis=1, numeric_only=True)
    expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
    tm.assert_series_equal(result, expected)
def test_quantile(self, datetime_frame):
    from numpy import percentile

    df = datetime_frame
    # column-wise quantile agrees with numpy percentile
    q = df.quantile(0.1, axis=0)
    assert q['A'] == percentile(df['A'], 10)
    tm.assert_index_equal(q.index, df.columns)

    # row-wise quantile is indexed by the original rows
    q = df.quantile(0.9, axis=1)
    assert (q['2000-01-17'] == percentile(df.loc['2000-01-17'], 90))
    tm.assert_index_equal(q.index, df.index)

    # test degenerate case
    q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
    assert(np.isnan(q['x']) and np.isnan(q['y']))

    # non-numeric exclusion: quantile silently drops the object column,
    # so it should match median() of the numeric data
    df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
    rs = df.quantile(0.5)
    xp = df.median().rename(0.5)
    assert_series_equal(rs, xp)

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(result, expected)

    result = df.quantile([.5, .75], axis=1)
    expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
                          3: [3.5, 3.75]}, index=[0.5, 0.75])
    assert_frame_equal(result, expected, check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
    result = df.quantile(.5, axis=1)
    expected = Series([3., 4.], index=[0, 1], name=0.5)
    assert_series_equal(result, expected)
def test_quantile_empty_no_rows_floats(self):
    # zero-row float frame: column-wise quantiles are all-NaN while
    # row-wise quantiles are empty
    df = DataFrame(columns=["a", "b"], dtype="float64")

    expected = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
    tm.assert_series_equal(df.quantile(0.5), expected)

    expected = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
    tm.assert_frame_equal(df.quantile([0.5]), expected)

    expected = Series([], index=[], dtype="float64", name=0.5)
    tm.assert_series_equal(df.quantile(0.5, axis=1), expected)

    expected = DataFrame(columns=[], index=[0.5])
    tm.assert_frame_equal(df.quantile([0.5], axis=1), expected)
def unemployment_estimation(
        duration: pandas.DataFrame) -> dict[str, Union[str, int]]:
    """Estimate unemployment-duration statistics from observed durations.

    Looks up each quantile configured in ``_QUANTILES`` in *duration*,
    exposes it as an integer number of days under the key
    ``f'{name}Days'``, then delegates to ``finalize_duration_estimation``
    for any derived fields.
    """
    quantiles = duration.quantile(list(_QUANTILES.values()))
    estimation: dict[str, Union[str, int]] = {
        f'{name}Days': int(typing.cast(float, quantiles.loc[level]))
        for name, level in _QUANTILES.items()
    }
    return finalize_duration_estimation(estimation)
def get_IQR(df:pd.DataFrame, k):
    '''
    df : the original data
    k : the multiple of iqr for boundary

    Returns a DataFrame with one row per column of ``df`` and two
    columns, ``lower_bound`` (Q1 - k*IQR) and ``upper_bound``
    (Q3 + k*IQR).
    '''
    first_q = df.quantile(0.25)
    third_q = df.quantile(0.75)
    spread = k * (third_q - first_q)
    return pd.concat(
        [pd.Series(first_q - spread, name='lower_bound'),
         pd.Series(third_q + spread, name='upper_bound')],
        axis=1)
def test_quantile(self):
    from numpy import percentile

    # column-wise quantile agrees with numpy percentile
    q = self.tsframe.quantile(0.1, axis=0)
    self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
    # smoke test only: the axis=1 result is computed but not checked
    q = self.tsframe.quantile(0.9, axis=1)
    q = self.intframe.quantile(0.1)
    self.assertEqual(q['A'], percentile(self.intframe['A'], 10))

    # test degenerate case
    q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
    assert (np.isnan(q['x']) and np.isnan(q['y']))

    # non-numeric exclusion: quantile drops the object column, so it
    # should match median() of the numeric data
    df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
    rs = df.quantile(0.5)
    xp = df.median()
    assert_series_equal(rs, xp)

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3])
    assert_series_equal(result, expected)

    result = df.quantile([.5, .75], axis=1)
    expected = DataFrame({
        1: [1.5, 1.75],
        2: [2.5, 2.75],
        3: [3.5, 3.75]
    }, index=[0.5, 0.75])
    assert_frame_equal(result, expected, check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
    result = df.quantile(.5, axis=1)
    expected = Series([3., 4.], index=[0, 1])
    assert_series_equal(result, expected)
def test_quantile_empty_no_rows_dt64(self):
    # Quantile over a zero-row datetime64 frame returns NaT per column;
    # exercises naive, mixed naive/tz-aware, and all-tz-aware layouts.
    df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
    res = df.quantile(0.5, numeric_only=False)
    exp = Series(
        [pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5
    )
    tm.assert_series_equal(res, exp)

    # Mixed dt64/dt64tz: no common datetime dtype, result is object
    df["a"] = df["a"].dt.tz_localize("US/Central")
    res = df.quantile(0.5, numeric_only=False)
    exp = exp.astype(object)
    tm.assert_series_equal(res, exp)

    # both dt64tz: the shared tz-aware dtype is preserved
    df["b"] = df["b"].dt.tz_localize("US/Central")
    res = df.quantile(0.5, numeric_only=False)
    exp = exp.astype(df["b"].dtype)
    tm.assert_series_equal(res, exp)
def test_rolling_quantile_np_percentile():
    # #9413: Tests that rolling window's quantile default behavior
    # is analogous to Numpy's percentile
    rows, cols = 10, 5
    idx = pd.date_range("20100101", periods=rows, freq="B")
    frame = DataFrame(np.random.rand(rows * cols).reshape((rows, -1)),
                      index=idx)

    from_pandas = frame.quantile([0.25, 0.5, 0.75], axis=0)
    from_numpy = np.percentile(frame, [25, 50, 75], axis=0)

    tm.assert_almost_equal(from_pandas.values, np.array(from_numpy))
def test_quantile_datetime(self):
    """Quantile on a frame mixing datetime and numeric columns."""
    df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})

    # exclude datetime: the default numeric_only drops column 'a'
    result = df.quantile(.5)
    expected = Series([2.5], index=['b'], name=0.5)
    # BUG FIX: `expected` was built but never compared (and lacked the
    # quantile name), so the numeric_only path was untested.
    assert_series_equal(result, expected)

    # datetime
    result = df.quantile(.5, numeric_only=False)
    expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
                      index=['a', 'b'], name=0.5)
    assert_series_equal(result, expected)

    # datetime w/ multi
    result = df.quantile([.5], numeric_only=False)
    expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
                         index=[.5], columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # axis = 1
    df['c'] = pd.to_datetime(['2011', '2012'])
    result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
    expected = Series([
        Timestamp('2010-07-02 12:00:00'),
        Timestamp('2011-07-02 12:00:00')
    ], index=[0, 1], name=0.5)
    assert_series_equal(result, expected)

    result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
    expected = DataFrame([[
        Timestamp('2010-07-02 12:00:00'),
        Timestamp('2011-07-02 12:00:00')
    ]], index=[0.5], columns=[0, 1])
    assert_frame_equal(result, expected)
def test_quantile_nan(self):
    # GH 14357 - float block where some cols have missing values
    df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
    df.iloc[-1, 1] = np.nan  # 'b' now has one NaN, 'a' is complete

    # NaNs are skipped, so 'b' is quantiled over its 4 present values
    res = df.quantile(0.5)
    exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
    tm.assert_series_equal(res, exp)

    res = df.quantile([0.5, 0.75])
    exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]},
                    index=[0.5, 0.75])
    tm.assert_frame_equal(res, exp)

    # row-wise: each row holds a single repeated value, so any quantile
    # of row i equals that value
    res = df.quantile(0.5, axis=1)
    exp = Series(np.arange(1.0, 6.0), name=0.5)
    tm.assert_series_equal(res, exp)

    res = df.quantile([0.5, 0.75], axis=1)
    exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
    tm.assert_frame_equal(res, exp)

    # full-nan column
    df['b'] = np.nan

    res = df.quantile(0.5)
    exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
    tm.assert_series_equal(res, exp)

    res = df.quantile([0.5, 0.75])
    exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
                    index=[0.5, 0.75])
    tm.assert_frame_equal(res, exp)
def test_quantile_nan(self):
    # GH 14357 - float block where some cols have missing values
    df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
    df.iloc[-1, 1] = np.nan

    result = df.quantile(0.5)
    expected = Series([3.0, 2.5], index=["a", "b"], name=0.5)
    tm.assert_series_equal(result, expected)

    result = df.quantile([0.5, 0.75])
    expected = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]},
                         index=[0.5, 0.75])
    tm.assert_frame_equal(result, expected)

    result = df.quantile(0.5, axis=1)
    expected = Series(np.arange(1.0, 6.0), name=0.5)
    tm.assert_series_equal(result, expected)

    result = df.quantile([0.5, 0.75], axis=1)
    expected = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
    tm.assert_frame_equal(result, expected)

    # full-nan column
    df["b"] = np.nan

    result = df.quantile(0.5)
    expected = Series([3.0, np.nan], index=["a", "b"], name=0.5)
    tm.assert_series_equal(result, expected)

    result = df.quantile([0.5, 0.75])
    expected = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]},
                         index=[0.5, 0.75])
    tm.assert_frame_equal(result, expected)
def test_quantile_date_range(self):
    # GH 2460
    # default quantile of a tz-aware datetime column keeps the tz dtype
    dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
    df = DataFrame(Series(dti))

    result = df.quantile(numeric_only=False)
    expected = Series(
        ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
    )
    tm.assert_series_equal(result, expected)
def test_numeric_only_default_false_warning(self, non_num_col):
    # GH #7308
    # relying on the numeric_only default must warn of the upcoming change
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]})
    df["C"] = non_num_col

    expected = Series([2.0, 3.0], index=["A", "B"], name=0.5)
    with tm.assert_produces_warning(FutureWarning, match="numeric_only"):
        result = df.quantile(0.5)
    tm.assert_series_equal(result, expected)
def test_quantile_multi(self):
    """List-of-quantiles returns a frame indexed by the quantiles."""
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
    result = df.quantile([0.25, 0.5])
    expected = DataFrame(
        [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
        index=[0.25, 0.5],
        columns=["a", "b", "c"],
    )
    assert_frame_equal(result, expected)

    # axis = 1
    result = df.quantile([0.25, 0.5], axis=1)
    # BUG FIX: the expected frame was a copy-paste of the axis=0 values and
    # was never asserted.  Row-wise each row is constant, so every quantile
    # of row i equals i + 1.
    expected = DataFrame(
        [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]], index=[0.25, 0.5], columns=[0, 1, 2]
    )
    assert_frame_equal(result, expected)

    # empty
    result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0)
    expected = DataFrame(
        {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
    )
    assert_frame_equal(result, expected)
def test_quantile_datetime(self):
    """Quantile on a frame mixing datetime and numeric columns."""
    df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]})

    # exclude datetime: the default numeric_only drops column "a"
    result = df.quantile(0.5)
    expected = Series([2.5], index=["b"], name=0.5)
    # BUG FIX: `expected` was built but never compared (and lacked the
    # quantile name), so the numeric_only path was untested.
    assert_series_equal(result, expected)

    # datetime
    result = df.quantile(0.5, numeric_only=False)
    expected = Series(
        [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
    )
    assert_series_equal(result, expected)

    # datetime w/ multi
    result = df.quantile([0.5], numeric_only=False)
    expected = DataFrame(
        [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"]
    )
    assert_frame_equal(result, expected)

    # axis = 1
    df["c"] = pd.to_datetime(["2011", "2012"])
    result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
    expected = Series(
        [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
        index=[0, 1],
        name=0.5,
    )
    assert_series_equal(result, expected)

    result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
    expected = DataFrame(
        [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
        index=[0.5],
        columns=[0, 1],
    )
    assert_frame_equal(result, expected)
def test_quantile_axis_parameter(self):
    # GH 9543/9544
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

    # axis=0 and its string alias "index" agree
    result = df.quantile(.5, axis=0)
    expected = Series([2., 3.], index=["A", "B"], name=0.5)
    assert_series_equal(result, expected)
    assert_series_equal(result, df.quantile(.5, axis="index"))

    # axis=1 and its string alias "columns" agree
    result = df.quantile(.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(result, expected)
    assert_series_equal(df.quantile(.5, axis="columns"), expected)

    # invalid axis values raise
    pytest.raises(ValueError, df.quantile, 0.1, axis=-1)
    pytest.raises(ValueError, df.quantile, 0.1, axis="column")
def test_quantile(self):
    # GH 17386
    # a sparse frame's quantile must match the dense equivalent
    data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
    q = 0.1

    result = SparseDataFrame(data).quantile(q)
    dense_expected = DataFrame(data).quantile(q)
    sparse_expected = SparseSeries(dense_expected)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_quantile_multi(self):
    # GH 17386
    # list-of-quantiles on a sparse frame must match the dense result
    data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
    q = [0.1, 0.5]

    result = SparseDataFrame(data).quantile(q)
    dense_expected = DataFrame(data).quantile(q)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def _create_summary(data: pd.DataFrame, original): summary = pd.DataFrame(0, index=data.columns, columns=[ "original", "mean", "std.error", "perc.025", "perc.975", "t stat." ]) summary.loc[:, "mean"] = data.mean(axis=0) summary.loc[:, "std.error"] = data.std(axis=0) summary.loc[:, "perc.025"] = data.quantile(0.025, axis=0) summary.loc[:, "perc.975"] = data.quantile(0.975, axis=0) summary.loc[:, "original"] = original summary.loc[:, "t stat."] = original / data.std(axis=0) return summary
def test_quantile():
    # GH 17386
    # a sparse frame's quantile must match the dense equivalent
    data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
    q = 0.1

    result = SparseDataFrame(data).quantile(q)
    dense_expected = DataFrame(data).quantile(q)
    sparse_expected = SparseSeries(dense_expected)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_quantile_multi():
    # GH 17386
    # list-of-quantiles on a sparse frame must match the dense result
    data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
    q = [0.1, 0.5]

    result = SparseDataFrame(data).quantile(q)
    dense_expected = DataFrame(data).quantile(q)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def test_quantile_multi(self):
    """List-of-quantiles returns a frame indexed by the quantiles."""
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['a', 'b', 'c'])
    result = df.quantile([.25, .5])
    expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                         index=[.25, .5], columns=['a', 'b', 'c'])
    assert_frame_equal(result, expected)

    # axis = 1
    result = df.quantile([.25, .5], axis=1)
    # BUG FIX: the expected frame was a copy-paste of the axis=0 values and
    # was never asserted.  Row-wise each row is constant, so every quantile
    # of row i equals i + 1.
    expected = DataFrame([[1., 2., 3.], [1., 2., 3.]],
                         index=[.25, .5], columns=[0, 1, 2])
    assert_frame_equal(result, expected)

    # empty
    result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
    expected = DataFrame({
        'x': [np.nan, np.nan],
        'y': [np.nan, np.nan]
    }, index=[.1, .9])
    assert_frame_equal(result, expected)
def testWordParser(self):
    '''
    try finding quantiles
    see https://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
    '''
    lookup = Lookup("test Word parser")
    sqlDB = lookup.getSQLDB()
    if sqlDB is not None:
        totalWordUsages = []
        # parse proceedings titles for each corpus source and collect
        # per-event word counts
        for source in ['wikidata', 'crossref', 'dblp', 'CEUR-WS']:
            listOfDicts = TestWordParser.getProceedingsTitles(
                sqlDB, source)
            cwp = CorpusWordParser()
            wordusages = cwp.parse(listOfDicts)
            lens = {}
            for wordusage in wordusages:
                totalWordUsages.append(wordusage.__dict__)
                # tally the number of word usages per eventId
                if wordusage.eventId in lens:
                    lens[wordusage.eventId] += 1
                else:
                    lens[wordusage.eventId] = 1
            df = DataFrame(lens.values())
            print(df.quantile(1))  # maximum word count
            quantileValues = df.quantile(.90)
            print(quantileValues)
            plot = Plot(lens.values(),
                        "%s wordcount histogram" % source,
                        xlabel="wordcount", ylabel="frequency")
            plot.hist(mode='save')
        # persist the accumulated word usages of all sources
        # NOTE(review): source formatting was lost; this tail is assumed to
        # run once after the source loop since totalWordUsages accumulates
        # across sources -- confirm against the original file
        wordUsageDBFile = Lookup.getDBFile("wordusage")
        wSQLDB = SQLDB(wordUsageDBFile)
        entityInfo = wSQLDB.createTable(totalWordUsages, "wordusage",
                                        withDrop=True)
        wSQLDB.store(totalWordUsages, entityInfo)
def get_outliers_iqr(df: DataFrame, iqr_mul=3) -> "tuple[DataFrame, DataFrame]":
    """
    Return boolean masks for lower and upper outliers based on the IQR rule.

    :param df: Pandas dataFrame
    :param iqr_mul: IQR_mul > 1.5 - normal outliers and extreme outliers,
        IQR_mul > 3 - extreme outliers
    :return lower_outliers, upper_outliers: boolean DataFrames, True where
        the value lies below Q1 - iqr_mul*IQR / above Q3 + iqr_mul*IQR
    """
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    lower_outliers = df < (Q1 - iqr_mul * IQR)
    upper_outliers = df > (Q3 + iqr_mul * IQR)
    # FIX: annotate the real return type -- the function returns a pair of
    # boolean masks, not a single DataFrame as previously declared.
    return lower_outliers, upper_outliers
def bucketize(feature: pd.DataFrame, fc: tf.feature_column.numeric_column,
              n_bins: int):
    '''Bin pandas series in dataframe examples.

    Args:
        feature: pandas.Series
        fc: tensorflow.feature_column.numeric_column
        n_bins: int

    Returns:
        tensorflow.feature_column.bucketized_column
    '''
    # n_bins + 1 evenly spaced quantile levels yield n_bins buckets
    boundaries = feature.quantile(np.linspace(0, 1, n_bins + 1))
    return tf.feature_column.bucketized_column(fc, list(boundaries))
def outliers(df: pd.DataFrame):
    """Print IQR-based outlier boundaries and counts for each column."""
    q1, q3 = df.quantile(0.25), df.quantile(0.75)
    iqr = q3 - q1
    low_boundary = q1 - 1.5 * iqr
    upp_boundary = q3 + 1.5 * iqr

    # count values beyond each fence, column by column
    below = (df[iqr.index] < low_boundary).sum()
    above = (df[iqr.index] > upp_boundary).sum()

    outliers = pd.DataFrame({
        'lower_value': low_boundary,
        'upper_boundary': upp_boundary,
        'num_of_outliers_L': below,
        'num_of_outliers_U': above
    })
    print(outliers)
def get_evaluation_stats(results: pd.DataFrame) -> pd.Series:
    """
    Computes model result statistics.

    Args:
        results: Model evaluation results.

    Returns:
        Model statistics where each row includes a different statistic.
    """
    stats = results.describe()
    # FIX: use .loc so the 95th/99th percentiles are appended as *rows*
    # (statistics), matching describe()'s layout.  Plain item assignment
    # added misaligned NaN columns when `results` is a DataFrame.
    stats.loc['95%'] = results.quantile(.95)
    stats.loc['99%'] = results.quantile(.99)
    return stats
def test_quantile_axis_mixed(self):
    # mixed on axis=1: only numeric columns participate
    df = DataFrame({"A": [1, 2, 3],
                    "B": [2., 3., 4.],
                    "C": pd.date_range('20130101', periods=3),
                    "D": ['foo', 'bar', 'baz']})
    result = df.quantile(.5, axis=1)
    assert_series_equal(result, Series([1.5, 2.5, 3.5], name=0.5))

    # must raise when non-numeric columns are kept
    self.assertRaises(TypeError, df.quantile, .5, axis=1,
                      numeric_only=False)
def remove_outliers(dataset: pd.DataFrame, strategy='Z', reindex=True, threshold=3) -> \
        pd.DataFrame:
    """
    A method that removes outliers from a dataset that contains no null values.
    Two strategies can be used for outliers' removal: z-score and IQR.
    In case the dataset contains less than 12 values only IQR strategy can be used.

    :param dataset: A dataset to remove outliers form containing no null values.
    :param strategy: A strategy for removal (Z or IQR).
    :param reindex: A new dataset will create new indexes if True.
    :param threshold: A threshold for a value to be considered outliers in case Z-score
        was chosen.
    :return: DataFrame containing no outliers.
    """
    if dataset.count()[0] < 12:
        strategy = 'IQR'
    if strategy.lower() == 'z':
        cols = list(dataset.columns)
        z_scores = pd.DataFrame()
        for col in cols:
            if np.issubdtype(dataset[col].dtype, np.number):
                col_zscore = col + '_zscore'
                z_scores[col_zscore] = np.abs(stats.zscore(dataset[col]))
        # noinspection PyTypeChecker
        no_outliers_dataset = dataset[(z_scores < threshold).all(axis=1)]
    else:
        first_quartile = dataset.quantile(0.25)
        third_quartile = dataset.quantile(0.75)
        iqr = third_quartile - first_quartile
        # BUG FIX: the lower fence must hang off the *first* quartile
        # (Q1 - 1.5*IQR); the original used Q3 - 1.5*IQR, which wrongly
        # discarded small but ordinary values.
        # noinspection PyTypeChecker
        no_outliers_dataset = dataset[~(
            (dataset < (first_quartile - 1.5 * iqr)) |
            (dataset > (third_quartile + 1.5 * iqr))).any(axis=1)]
    no_outliers_dataset = no_outliers_dataset.reset_index(
        drop=True) if reindex else no_outliers_dataset
    return no_outliers_dataset
def _get_quantiles(df: pd.DataFrame, feats: List[str], filter_debug: bool = True, filter_continue: bool = True) -> Dict[str, List[float]]: filter_strings = [] if filter_debug: filter_strings += ['(debug==0)'] if filter_continue: filter_strings += ['(c==0)'] if filter_strings: df = df.rename({ "continue": "c" }, axis=1).query(' & '.join(filter_strings)).rename({"c": "continue"}, axis=1) df = df[feats].replace(0.0, pd.NA) df = df.quantile(np.arange(0, 1, .01)) quantiles = df.to_dict('list') return quantiles
def get_outliers(df: pd.DataFrame) -> (int, str):
    """
    Filter outliers in the "Fare" column using the IQR method: compute
    IQR = Q3 - Q1 and keep only the rows satisfying
    Q1 - 1.5*IQR < "Fare" < Q3 + 1.5*IQR.
    Draw a box plot of "Fare" before and after filtering.

    Return a tuple with the number of outliers and the name of the
    passenger with the largest outlier.
    """
    fares = df["Fare"]
    q1 = df.quantile(0.25)["Fare"]
    q3 = df.quantile(0.75)["Fare"]
    iqr = q3 - q1

    keep = ~((fares < (q1 - 1.5 * iqr)) | (fares > (q3 + 1.5 * iqr)))
    df_out = df[keep]

    df.boxplot(column="Fare")
    plt.show()
    df_out.boxplot(column="Fare")
    plt.show()

    # the largest fare is by construction the largest (upper) outlier
    top_pos = df.index[fares == max(list(fares))].tolist()[0]
    return (len(df) - len(df_out), df.iloc[top_pos]["Name"])
def calculate_quantiles(
    data: pd.DataFrame
) -> Tuple[List[str], ...]:
    """
    Calculate the 0/25/50/75/100 percentiles of each column.

    Args:
    -----
    - data: a pandas DataFrame

    Return:
    -------
    A tuple(list(str...)): one list of stringified per-column values for
    each quantile level, in ascending order.
    """
    levels = [.0, .25, .50, .75, 1.0]
    rows = data.quantile(levels).values
    return tuple([str(value) for value in row] for row in rows)
def fit(
    self: T_Self,
    X: pd.DataFrame,
    y: Optional[Union[pd.Series, pd.DataFrame]] = None,
    **fit_params,
) -> T_Self:
    """
    Fit the transformer.

    Computes the per-column Tukey fences Q1 - m*IQR and Q3 + m*IQR
    (m = ``self.iqr_multiple``); values beyond them count as outliers.

    :return: the fitted transformer
    """
    self: OutlierRemoverDF  # support type hinting in PyCharm

    lower_quartile: pd.Series = X.quantile(q=0.25)
    upper_quartile: pd.Series = X.quantile(q=0.75)
    fence_margin: pd.Series = (upper_quartile - lower_quartile) * self.iqr_multiple

    self.threshold_low_ = lower_quartile - fence_margin
    self.threshold_high_ = upper_quartile + fence_margin
    self._features_original = X.columns.to_series()
    return self
def test_quantile_interpolation_np_lt_1p9(self):
    # GH #10174
    # On numpy < 1.9 the `interpolation` keyword is not supported:
    # 'linear' still works, anything else must raise ValueError.
    if not _np_version_under1p9:
        raise nose.SkipTest("Numpy version is greater than 1.9")

    from numpy import percentile

    # interpolation = linear (default case)
    q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
    self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
    q = self.intframe.quantile(0.1)
    self.assertEqual(q['A'], percentile(self.intframe['A'], 10))

    # test with and without interpolation keyword
    q1 = self.intframe.quantile(0.1)
    self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10))
    assert_series_equal(q, q1)

    # interpolation method other than default linear
    expErrMsg = "Interpolation methods other than linear"
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    with assertRaisesRegexp(ValueError, expErrMsg):
        df.quantile(.5, axis=1, interpolation='nearest')

    with assertRaisesRegexp(ValueError, expErrMsg):
        df.quantile([.5, .75], axis=1, interpolation='lower')

    # test degenerate case
    df = DataFrame({'x': [], 'y': []})
    with assertRaisesRegexp(ValueError, expErrMsg):
        q = df.quantile(0.1, axis=0, interpolation='higher')

    # multi
    df = DataFrame([[1, 1, 1], [2, 2, 2],
                    [3, 3, 3]], columns=['a', 'b', 'c'])
    with assertRaisesRegexp(ValueError, expErrMsg):
        df.quantile([.25, .5], interpolation='midpoint')
def test_quantile_box(self):
    # Quantiles over datetime, tz-aware datetime, and timedelta columns.
    df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
                          pd.Timestamp('2011-01-02'),
                          pd.Timestamp('2011-01-03')],
                    'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                          pd.Timestamp('2011-01-02', tz='US/Eastern'),
                          pd.Timestamp('2011-01-03', tz='US/Eastern')],
                    'C': [pd.Timedelta('1 days'),
                          pd.Timedelta('2 days'),
                          pd.Timedelta('3 days')]})

    res = df.quantile(0.5, numeric_only=False)
    exp = pd.Series([pd.Timestamp('2011-01-02'),
                     pd.Timestamp('2011-01-02', tz='US/Eastern'),
                     pd.Timedelta('2 days')],
                    name=0.5, index=['A', 'B', 'C'])
    tm.assert_series_equal(res, exp)

    res = df.quantile([0.5], numeric_only=False)
    exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timedelta('2 days')]],
                       index=[0.5], columns=['A', 'B', 'C'])
    tm.assert_frame_equal(res, exp)

    # DatetimeBlock may be consolidated and contain NaT in different loc
    # (the paired upper/lower-case columns share a dtype but place NaT at
    # different positions; each column must skip its own NaT)
    df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
                          pd.NaT,
                          pd.Timestamp('2011-01-02'),
                          pd.Timestamp('2011-01-03')],
                    'a': [pd.Timestamp('2011-01-01'),
                          pd.Timestamp('2011-01-02'),
                          pd.NaT,
                          pd.Timestamp('2011-01-03')],
                    'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                          pd.NaT,
                          pd.Timestamp('2011-01-02', tz='US/Eastern'),
                          pd.Timestamp('2011-01-03', tz='US/Eastern')],
                    'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                          pd.Timestamp('2011-01-02', tz='US/Eastern'),
                          pd.NaT,
                          pd.Timestamp('2011-01-03', tz='US/Eastern')],
                    'C': [pd.Timedelta('1 days'),
                          pd.Timedelta('2 days'),
                          pd.Timedelta('3 days'),
                          pd.NaT],
                    'c': [pd.NaT,
                          pd.Timedelta('1 days'),
                          pd.Timedelta('2 days'),
                          pd.Timedelta('3 days')]},
                   columns=list('AaBbCc'))

    res = df.quantile(0.5, numeric_only=False)
    exp = pd.Series([pd.Timestamp('2011-01-02'),
                     pd.Timestamp('2011-01-02'),
                     pd.Timestamp('2011-01-02', tz='US/Eastern'),
                     pd.Timestamp('2011-01-02', tz='US/Eastern'),
                     pd.Timedelta('2 days'),
                     pd.Timedelta('2 days')],
                    name=0.5, index=list('AaBbCc'))
    tm.assert_series_equal(res, exp)

    res = df.quantile([0.5], numeric_only=False)
    exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timedelta('2 days'),
                         pd.Timedelta('2 days')]],
                       index=[0.5], columns=list('AaBbCc'))
    tm.assert_frame_equal(res, exp)