def test_pivot(self):
    """Check pivot values, axis-name tracking and pivoting several value columns."""
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.],
    }
    frame = DataFrame(data)
    pivoted = frame.pivot(index='index', columns='columns', values='values')

    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.},
    })
    expected.index.name, expected.columns.name = 'index', 'columns'
    assert_frame_equal(pivoted, expected)

    # name tracking
    self.assertEqual(pivoted.index.name, 'index')
    self.assertEqual(pivoted.columns.name, 'columns')

    # don't specify values
    pivoted = frame.pivot(index='index', columns='columns')
    self.assertEqual(pivoted.index.name, 'index')
    self.assertEqual(pivoted.columns.names, (None, 'columns'))

    # pivot multiple columns
    wp = tm.makePanel()
    lp = wp.to_frame()
    df = lp.reset_index()
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    assert_frame_equal(df.pivot(index='major', columns='minor'), lp.unstack())
def test_pivot_periods(self):
    """Pivoting on Period-valued columns yields PeriodIndex row and column labels."""
    days = [pd.Period(day, 'D') for day in ('2013-01-01', '2013-01-02')]
    months = [pd.Period(month, 'M') for month in ('2013-01', '2013-02')]
    df = DataFrame({
        'p1': [days[0], days[1], days[0], days[1]],
        'p2': [months[0], months[0], months[1], months[1]],
        'data1': np.arange(4, dtype='int64'),
        'data2': np.arange(4, dtype='int64'),
    })
    row_labels = pd.PeriodIndex(['2013-01-01', '2013-01-02'],
                                name='p1', freq='D')

    # values omitted: both data columns are pivoted under a two-level header
    top_level = Index(['data1', 'data1', 'data2', 'data2'])
    period_level = pd.PeriodIndex(['2013-01', '2013-02'] * 2,
                                  name='p2', freq='M')
    header = pd.MultiIndex.from_arrays([top_level, period_level])
    expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
                         index=row_labels, columns=header)
    tm.assert_frame_equal(df.pivot(index='p1', columns='p2'), expected)

    # a single values column gives a flat PeriodIndex header
    flat_header = pd.PeriodIndex(['2013-01', '2013-02'], name='p2', freq='M')
    expected = DataFrame([[0, 2], [1, 3]],
                         index=row_labels, columns=flat_header)
    result = df.pivot(index='p1', columns='p2', values='data1')
    tm.assert_frame_equal(result, expected)
def test_pivot(self):
    """Check pivot values, axis-name tracking and pivoting several value columns."""
    data = {
        "index": ["A", "B", "C", "C", "B", "A"],
        "columns": ["One", "One", "One", "Two", "Two", "Two"],
        "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
    }
    frame = DataFrame(data)
    pivoted = frame.pivot(index="index", columns="columns", values="values")

    expected = DataFrame(
        {
            "One": {"A": 1.0, "B": 2.0, "C": 3.0},
            "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
        }
    )
    expected.index.name, expected.columns.name = "index", "columns"
    assert_frame_equal(pivoted, expected)

    # name tracking
    self.assertEqual(pivoted.index.name, "index")
    self.assertEqual(pivoted.columns.name, "columns")

    # don't specify values
    pivoted = frame.pivot(index="index", columns="columns")
    self.assertEqual(pivoted.index.name, "index")
    self.assertEqual(pivoted.columns.names, (None, "columns"))

    # pivot multiple columns
    wp = tm.makePanel()
    lp = wp.to_frame()
    df = lp.reset_index()
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    assert_frame_equal(df.pivot(index="major", columns="minor"), lp.unstack())
def test_pivot(self):
    """Check pivot values and that index/column names survive the reshape."""
    frame = DataFrame({
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.],
    })

    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.},
    })
    expected.index.name = 'index'
    expected.columns.name = 'columns'

    with_values = frame.pivot(index='index', columns='columns',
                              values='values')
    tm.assert_frame_equal(with_values, expected)

    # name tracking
    assert with_values.index.name == 'index'
    assert with_values.columns.name == 'columns'

    # don't specify values
    without_values = frame.pivot(index='index', columns='columns')
    assert without_values.index.name == 'index'
    assert without_values.columns.names == (None, 'columns')
def test_pivot_index_with_nan(self):
    """Pivot must keep NaN labels on either axis (string and datetime-like keys)."""
    # GH 3588
    nan = np.nan
    df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
                    'b': ['C1', 'C2', 'C3', 'C4'],
                    'c': [10, 15, 17, 20]})
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    result = df.pivot(index='a', columns='b', values='c')
    expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                          [nan, 15, nan, nan], [nan, nan, nan, 20]],
                         index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                         columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(df.pivot(index='b', columns='a', values='c'),
                          expected.T)

    # GH9491
    df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'),
                    'c': 100 + np.arange(6)})
    df['b'] = df['a'] - pd.Timestamp('2014-02-02')
    df.loc[1, 'a'] = df.loc[3, 'a'] = nan
    df.loc[1, 'b'] = df.loc[4, 'b'] = nan

    pv = df.pivot(index='a', columns='b', values='c')
    # every input row must land in exactly one non-null cell
    self.assertEqual(pv.notnull().values.sum(), len(df))

    for _, row in df.iterrows():
        self.assertEqual(pv.loc[row['a'], row['b']], row['c'])

    tm.assert_frame_equal(df.pivot(index='b', columns='a', values='c'), pv.T)
def test_pivot_index_none(self):
    """Pivot without ``index=`` must fall back to the frame's existing index."""
    # gh-3962
    raw = DataFrame({
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.],
    })
    frame = raw.set_index('index')

    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.},
    })
    expected.index.name = 'index'
    expected.columns.name = 'columns'

    result = frame.pivot(columns='columns', values='values')
    assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns='columns')
    two_level = pd.MultiIndex.from_tuples(
        [('values', 'One'), ('values', 'Two')], names=[None, 'columns'])
    expected.columns = two_level
    expected.index.name = 'index'
    tm.assert_frame_equal(result, expected, check_names=False)
    assert result.index.name == 'index'
    assert result.columns.names == (None, 'columns')

    # back to a flat header for the explicit-values case
    expected.columns = expected.columns.droplevel(0)
    result = frame.pivot(columns='columns', values='values')
    expected.columns.name = 'columns'
    tm.assert_frame_equal(result, expected)
def test_pivot(self):
    """Check pivot values, axis-name tracking and pivoting several value columns."""
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.],
    }
    frame = DataFrame(data)
    pivoted = frame.pivot(index='index', columns='columns', values='values')

    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.},
    })
    expected.index.name, expected.columns.name = 'index', 'columns'
    tm.assert_frame_equal(pivoted, expected)

    # name tracking
    assert pivoted.index.name == 'index'
    assert pivoted.columns.name == 'columns'

    # don't specify values
    pivoted = frame.pivot(index='index', columns='columns')
    assert pivoted.index.name == 'index'
    assert pivoted.columns.names == (None, 'columns')

    with catch_warnings(record=True):
        # pivot multiple columns
        simplefilter("ignore", FutureWarning)
        wp = tm.makePanel()
        lp = wp.to_frame()
        df = lp.reset_index()
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        tm.assert_frame_equal(df.pivot(index='major', columns='minor'),
                              lp.unstack())
def test_pivot(self):
    """Check pivot values and that index/column names survive the reshape."""
    frame = DataFrame(
        {
            "index": ["A", "B", "C", "C", "B", "A"],
            "columns": ["One", "One", "One", "Two", "Two", "Two"],
            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
        }
    )

    expected = DataFrame(
        {
            "One": {"A": 1.0, "B": 2.0, "C": 3.0},
            "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
        }
    )
    expected.index.name = "index"
    expected.columns.name = "columns"

    with_values = frame.pivot(index="index", columns="columns", values="values")
    tm.assert_frame_equal(with_values, expected)

    # name tracking
    assert with_values.index.name == "index"
    assert with_values.columns.name == "columns"

    # don't specify values
    without_values = frame.pivot(index="index", columns="columns")
    assert without_values.index.name == "index"
    assert without_values.columns.names == (None, "columns")
def test_pivot_index_with_nan(self):
    """Pivot must keep NaN labels on either axis (string and datetime-like keys)."""
    # GH 3588
    nan = np.nan
    df = DataFrame({
        'a': ['R1', 'R2', nan, 'R4'],
        'b': ['C1', 'C2', 'C3', 'C4'],
        'c': [10, 15, 17, 20],
    })
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    result = df.pivot(index='a', columns='b', values='c')
    expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                          [nan, 15, nan, nan], [nan, nan, nan, 20]],
                         index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                         columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(df.pivot(index='b', columns='a', values='c'),
                          expected.T)

    # GH9491
    df = DataFrame({
        'a': pd.date_range('2014-02-01', periods=6, freq='D'),
        'c': 100 + np.arange(6),
    })
    df['b'] = df['a'] - pd.Timestamp('2014-02-02')
    df.loc[1, 'a'] = df.loc[3, 'a'] = nan
    df.loc[1, 'b'] = df.loc[4, 'b'] = nan

    pv = df.pivot(index='a', columns='b', values='c')
    # every input row must land in exactly one non-null cell
    self.assertEqual(pv.notnull().values.sum(), len(df))

    for _, row in df.iterrows():
        self.assertEqual(pv.loc[row['a'], row['b']], row['c'])

    tm.assert_frame_equal(df.pivot(index='b', columns='a', values='c'), pv.T)
def test_pivot_index_none(self):
    """Pivot without ``index=`` must fall back to the frame's existing index."""
    # gh-3962
    raw = DataFrame(
        {
            "index": ["A", "B", "C", "C", "B", "A"],
            "columns": ["One", "One", "One", "Two", "Two", "Two"],
            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
        }
    )
    frame = raw.set_index("index")

    expected = DataFrame(
        {
            "One": {"A": 1.0, "B": 2.0, "C": 3.0},
            "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
        }
    )
    expected.index.name = "index"
    expected.columns.name = "columns"

    result = frame.pivot(columns="columns", values="values")
    tm.assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns="columns")
    two_level = pd.MultiIndex.from_tuples(
        [("values", "One"), ("values", "Two")], names=[None, "columns"]
    )
    expected.columns = two_level
    expected.index.name = "index"
    tm.assert_frame_equal(result, expected, check_names=False)
    assert result.index.name == "index"
    assert result.columns.names == (None, "columns")

    # back to a flat header for the explicit-values case
    expected.columns = expected.columns.droplevel(0)
    result = frame.pivot(columns="columns", values="values")
    expected.columns.name = "columns"
    tm.assert_frame_equal(result, expected)
def preprocess(df: pd.DataFrame):
    """Pivot per-level order-book rows into one wide feature row per ``datetime``.

    Rows sharing a ``datetime`` are numbered 1..N in their existing order and
    spread into ``price<N>``, ``bid_vol<N>`` and ``as_vol<N>`` columns. Prices
    are re-expressed relative to ``base``, the mean of ``price10`` and
    ``price11``, so at least 11 rows per timestamp are required.

    Note: a helper column ``num`` is added to ``df`` in place.

    Args:
        df: frame with ``datetime``, ``price``, ``bid_vol`` and ``ask_vol`` columns.

    Returns:
        DataFrame indexed by ``datetime`` holding the relative prices, ``base``
        and the bid/ask volume columns.
    """
    # Pivot to create feature columns of level2 prices and volumes
    df["num"] = df.groupby("datetime").cumcount() + 1

    price_pivoted = df.pivot(index="datetime", columns="num", values="price")
    price_pivoted.columns = "price" + price_pivoted.columns.astype(str)
    # remember the price columns before "base" is appended
    price_cols = list(price_pivoted.columns)
    price_pivoted["base"] = (price_pivoted["price10"] + price_pivoted["price11"]) / 2
    # express every price level relative to the base price in one vectorised step
    price_pivoted[price_cols] = price_pivoted[price_cols].sub(
        price_pivoted["base"], axis=0)

    bid_vol_pivoted = df.pivot(index="datetime", columns="num", values="bid_vol")
    bid_vol_pivoted.columns = "bid_vol" + bid_vol_pivoted.columns.astype(str)

    ask_vol_pivoted = df.pivot(index="datetime", columns="num", values="ask_vol")
    # NOTE(review): prefix is "as_vol", not "ask_vol"; kept as-is because
    # downstream consumers may depend on these column names - confirm.
    ask_vol_pivoted.columns = "as_vol" + ask_vol_pivoted.columns.astype(str)

    return price_pivoted.join(bid_vol_pivoted).join(ask_vol_pivoted)
def test_pivot(self):
    """Check pivot values, axis-name tracking and pivoting several value columns."""
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.],
    }
    frame = DataFrame(data)
    pivoted = frame.pivot(index='index', columns='columns', values='values')

    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.},
    })
    expected.index.name, expected.columns.name = 'index', 'columns'
    tm.assert_frame_equal(pivoted, expected)

    # name tracking
    assert pivoted.index.name == 'index'
    assert pivoted.columns.name == 'columns'

    # don't specify values
    pivoted = frame.pivot(index='index', columns='columns')
    assert pivoted.index.name == 'index'
    assert pivoted.columns.names == (None, 'columns')

    with catch_warnings(record=True):
        # pivot multiple columns
        wp = tm.makePanel()
        lp = wp.to_frame()
        df = lp.reset_index()
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        tm.assert_frame_equal(df.pivot(index='major', columns='minor'),
                              lp.unstack())
def bar(df: pd.DataFrame, legend=False, values="recall", is_fs=True, is_class=True):
    """Print and show grouped bar charts of one metric per classifier / feature set.

    Args:
        df: long-format frame with ``clf``, ``features`` and the ``values`` column.
        legend: forwarded to ``DataFrame.plot.bar``.
        values: name of the metric column to pivot and plot.
        is_fs: when true, show the chart grouped by feature set.
        is_class: when true, show the chart grouped by classifier.
    """

    def _show_pivot_bar(index, columns, title):
        # One chart: pivot, print the table, draw it, hide the x-axis label.
        pivoted = df.pivot(index=index, columns=columns, values=values)
        print(pivoted)
        # Use the Axes returned by the plot call; plt.axes() would add a new,
        # empty Axes on current matplotlib and hide the label on the wrong one.
        ax = pivoted.plot.bar(legend=legend, ylim=(0.8, 1.0), figsize=(10, 3),
                              title=title)
        ax.get_xaxis().get_label().set_visible(False)
        plt.show()

    if is_class:
        _show_pivot_bar("clf", "features",
                        "{} : classifiers / feature sets".format(values))
    if is_fs:
        _show_pivot_bar("features", "clf",
                        "{} : feature sets / classifiers".format(values))
def test_pivot_index_none(self):
    """Pivot without ``index=`` must fall back to the frame's existing index."""
    # gh-3962
    data = {
        "index": ["A", "B", "C", "C", "B", "A"],
        "columns": ["One", "One", "One", "Two", "Two", "Two"],
        "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
    }
    frame = DataFrame(data).set_index("index")
    result = frame.pivot(columns="columns", values="values")

    expected = DataFrame(
        {
            "One": {"A": 1.0, "B": 2.0, "C": 3.0},
            "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
        }
    )
    expected.index.name, expected.columns.name = "index", "columns"
    assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns="columns")
    expected.columns = pd.MultiIndex.from_tuples(
        [("values", "One"), ("values", "Two")], names=[None, "columns"]
    )
    expected.index.name = "index"
    assert_frame_equal(result, expected, check_names=False)
    self.assertEqual(result.index.name, "index")
    self.assertEqual(result.columns.names, (None, "columns"))

    # back to a flat header for the explicit-values case
    expected.columns = expected.columns.droplevel(0)
    result = frame.pivot(columns="columns", values="values")
    expected.columns.name = "columns"
    assert_frame_equal(result, expected)
def test_pivot_index_with_nan(self):
    """Pivot must keep NaN labels on either axis (string and datetime-like keys)."""
    # GH 3588
    nan = np.nan
    df = DataFrame(
        {
            "a": ["R1", "R2", nan, "R4"],
            "b": ["C1", "C2", "C3", "C4"],
            "c": [10, 15, 17, 20],
        }
    )
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    result = df.pivot(index="a", columns="b", values="c")
    expected = DataFrame(
        [
            [nan, nan, 17, nan],
            [10, nan, nan, nan],
            [nan, 15, nan, nan],
            [nan, nan, nan, 20],
        ],
        index=Index([nan, "R1", "R2", "R4"], name="a"),
        columns=Index(["C1", "C2", "C3", "C4"], name="b"),
    )
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(df.pivot(index="b", columns="a", values="c"), expected.T)

    # GH9491
    df = DataFrame(
        {
            "a": pd.date_range("2014-02-01", periods=6, freq="D"),
            "c": 100 + np.arange(6),
        }
    )
    df["b"] = df["a"] - pd.Timestamp("2014-02-02")
    df.loc[1, "a"] = df.loc[3, "a"] = nan
    df.loc[1, "b"] = df.loc[4, "b"] = nan

    pv = df.pivot(index="a", columns="b", values="c")
    # every input row must land in exactly one non-null cell
    self.assertEqual(pv.notnull().values.sum(), len(df))

    for _, row in df.iterrows():
        self.assertEqual(pv.loc[row["a"], row["b"]], row["c"])

    tm.assert_frame_equal(df.pivot(index="b", columns="a", values="c"), pv.T)
def test_pivot_duplicates(self):
    """Pivoting with a repeated (index, columns) pair must raise ``ValueError``."""
    data = DataFrame({
        "a": ["bar", "bar", "foo", "foo", "foo"],
        "b": ["one", "two", "one", "one", "two"],
        "c": [1.0, 2.0, 3.0, 3.0, 4.0],
    })
    with pytest.raises(ValueError, match="duplicate entries"):
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        data.pivot(index="a", columns="b", values="c")
def test_pivot_duplicates(self):
    """Pivoting with a repeated (index, columns) pair must raise ``ValueError``."""
    data = DataFrame({
        'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
        'b': ['one', 'two', 'one', 'one', 'two'],
        'c': [1., 2., 3., 3., 4.],
    })
    with pytest.raises(ValueError, match='duplicate entries'):
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        data.pivot(index='a', columns='b', values='c')
def test_pivot_duplicates(self):
    """Pivoting with a repeated (index, columns) pair must raise ``ValueError``."""
    data = DataFrame({
        'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
        'b': ['one', 'two', 'one', 'one', 'two'],
        'c': [1., 2., 3., 3., 4.],
    })
    with tm.assert_raises_regex(ValueError, 'duplicate entries'):
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        data.pivot(index='a', columns='b', values='c')
def test_pivot_duplicates(self):
    """Pivoting with a repeated (index, columns) pair must raise ``ValueError``."""
    data = DataFrame(
        {
            "a": ["bar", "bar", "foo", "foo", "foo"],
            "b": ["one", "two", "one", "one", "two"],
            "c": [1.0, 2.0, 3.0, 3.0, 4.0],
        }
    )
    with assertRaisesRegexp(ValueError, "duplicate entries"):
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        data.pivot(index="a", columns="b", values="c")
class Pivot:
    """Benchmark: pivot a long time-series frame (50 variables x 10000 hours)."""

    def setup(self):
        """Build the long-format frame with ``value``, ``variable`` and ``date``."""
        N = 10000
        index = date_range('1/1/2000', periods=N, freq='h')
        data = {'value': np.random.randn(N * 50),
                'variable': np.arange(50).repeat(N),
                'date': np.tile(index.values, 50)}
        self.df = DataFrame(data)

    def time_reshape_pivot_time_series(self):
        """Time reshaping to one row per date and one column per variable."""
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        self.df.pivot(index='date', columns='variable', values='value')
class Pivot(object):
    """Benchmark: pivot a long time-series frame (50 variables x 10000 hours)."""

    def setup(self):
        """Build the long-format frame with ``value``, ``variable`` and ``date``."""
        N = 10000
        index = date_range('1/1/2000', periods=N, freq='h')
        data = {'value': np.random.randn(N * 50),
                'variable': np.arange(50).repeat(N),
                'date': np.tile(index.values, 50)}
        self.df = DataFrame(data)

    def time_reshape_pivot_time_series(self):
        """Time reshaping to one row per date and one column per variable."""
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        self.df.pivot(index='date', columns='variable', values='value')
def test_pivot_index_with_nan(self):
    """Pivot must keep a NaN label on the index (and on the columns when swapped)."""
    # GH 3588
    nan = np.nan
    df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
                    'b': ['C1', 'C2', 'C3', 'C4'],
                    'c': [10, 15, 17, 20]})
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    result = df.pivot(index='a', columns='b', values='c')
    expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                          [nan, 15, nan, nan], [nan, nan, nan, 20]],
                         index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                         columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(df.pivot(index='b', columns='a', values='c'),
                          expected.T)
class Pivot:
    """Benchmark: pivot a long time-series frame (50 variables x 10000 hours)."""

    def setup(self):
        """Build the long-format frame with ``value``, ``variable`` and ``date``."""
        N = 10000
        index = date_range("1/1/2000", periods=N, freq="h")
        data = {
            "value": np.random.randn(N * 50),
            "variable": np.arange(50).repeat(N),
            "date": np.tile(index.values, 50),
        }
        self.df = DataFrame(data)

    def time_reshape_pivot_time_series(self):
        """Time reshaping to one row per date and one column per variable."""
        # keyword arguments: positional use of pivot() is not accepted by newer pandas
        self.df.pivot(index="date", columns="variable", values="value")
def test_pivot_with_tz(self):
    """Pivoting on tz-aware columns keeps each axis' timezone and name."""
    # GH 5878
    first, second = datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0)
    early, late = datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 2, 9, 0)
    df = DataFrame({
        'dt1': [first, second, first, second],
        'dt2': [early, early, late, late],
        'data1': np.arange(4, dtype='int64'),
        'data2': np.arange(4, dtype='int64'),
    })
    df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
    df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

    row_labels = pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
                                  name='dt1', tz='US/Pacific')

    # values omitted: both data columns are pivoted under a two-level header
    top_level = Index(['data1', 'data1', 'data2', 'data2'])
    tz_level = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2,
                                name='dt2', tz='Asia/Tokyo')
    header = pd.MultiIndex.from_arrays([top_level, tz_level])
    expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
                         index=row_labels, columns=header)
    tm.assert_frame_equal(df.pivot(index='dt1', columns='dt2'), expected)

    # a single values column gives a flat tz-aware header
    flat_header = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'],
                                   name='dt2', tz='Asia/Tokyo')
    expected = DataFrame([[0, 2], [1, 3]],
                         index=row_labels, columns=flat_header)
    result = df.pivot(index='dt1', columns='dt2', values='data1')
    tm.assert_frame_equal(result, expected)
def test_pivot_index_with_nan(self):
    """Pivot must keep a NaN label on the index (and on the columns when swapped)."""
    # GH 3588
    nan = np.nan
    df = DataFrame({
        'a': ['R1', 'R2', nan, 'R4'],
        'b': ['C1', 'C2', 'C3', 'C4'],
        'c': [10, 15, 17, 20],
    })
    # keyword arguments: positional use of pivot() is not accepted by newer pandas
    result = df.pivot(index='a', columns='b', values='c')
    expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                          [nan, 15, nan, nan], [nan, nan, nan, 20]],
                         index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                         columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(df.pivot(index='b', columns='a', values='c'),
                          expected.T)
def pipe_age_filter_entries(self, df: pd.DataFrame) -> pd.DataFrame:
    """Keep only (date, location) entries whose age breakdown looks reliable.

    Two filters are applied:

    1. The ``ALL`` age group is compared with the sum of the mandatory age
       groups plus the under-age figure; entries where ``(ALL - sum) / ALL``
       exceeds 0.05 are dropped.
    2. Rows where ``UnknownDose / total_vaccinations`` is 0.05 or more are
       dropped.

    Raises:
        ValueError: if the surviving (date, location) pairs are not unique.
    """
    key_cols = ["date", "location"]
    max_mismatch_ratio = 0.05
    max_unknown_dose_ratio = 0.05

    # One row per (date, location), one column per age group
    wide = df.pivot(index=key_cols, columns="age_group", values="total_vaccinations")
    wide = wide.reset_index()
    wide = wide.dropna(subset=AGE_GROUPS_MUST_HAVE, how="any")

    # Under-age figure: the "lvl0" column, falling back to the sum of "lvl1" columns
    underage_fallback = wide[AGE_GROUP_UNDERAGE_LEVELS["lvl1"]].sum(axis=1)
    wide = wide.assign(
        debug_u18=wide[AGE_GROUP_UNDERAGE_LEVELS["lvl0"]].fillna(underage_fallback)
    )
    # Debug variable: sum over all ages
    wide = wide.assign(debug=wide[AGE_GROUPS_MUST_HAVE].sum(axis=1) + wide["debug_u18"])
    gap = wide["ALL"] - wide["debug"]
    wide = wide.assign(debug_diff=gap, debug_diff_perc=gap / wide["ALL"])

    # NOTE(review): the comparison is signed, so entries where the age sum
    # exceeds ALL always pass - confirm this is intended.
    wide = wide[wide["debug_diff_perc"] <= max_mismatch_ratio]

    valid_entries_ids = wide[key_cols]
    if valid_entries_ids.value_counts().max() != 1:
        raise ValueError("Some entries appear to be duplicated")
    df = df.merge(valid_entries_ids, on=key_cols)

    # Drop rows where too large a share of doses has unknown dose number
    unknown_ratio = df["UnknownDose"] / df["total_vaccinations"]
    return df[unknown_ratio < max_unknown_dose_ratio]
def print_survival_rate(df):
    """Plot, per domain, each algorithm's survival rate against action duration.

    A trial survives when its ``errorMessage`` is null; an error other than
    ``"Timeout"`` counts as a death. Timeouts are excluded from the rate's
    denominator but included in the trial count used for the confidence
    interval.

    NOTE(review): every domain is written to the same ``test.png``, so only
    the last domain's figure survives - confirm whether that is intended.

    Args:
        df: results frame with ``domainPath``, ``algorithmName``,
            ``actionDuration`` and ``errorMessage`` columns.
    """
    # A scalar key (not a one-element list) keeps group labels scalar on all
    # pandas versions.
    for _, domain_group in df.groupby("domainPath"):
        survival_results = DataFrame(
            columns="actionDuration algorithmName survival lbound rbound".split())

        for fields, action_group in domain_group.groupby(['algorithmName', 'actionDuration']):
            total_trials = len(action_group)

            error_experiments = action_group[action_group["errorMessage"].notnull()]
            deaths = len(error_experiments[error_experiments["errorMessage"] != "Timeout"])
            successes = len(action_group[~action_group["errorMessage"].notnull()])

            survival_confint = proportion_confint(successes, total_trials, 0.05)
            decided = successes + deaths
            # When every trial timed out there is nothing to divide by.
            survival_rate = successes / decided if decided else float("nan")
            survival_results = add_row(
                survival_results,
                [fields[1], fields[0], survival_rate,
                 survival_confint[0], survival_confint[1]])

        fig, ax = plt.subplots()
        errors = []
        for alg, alg_group in survival_results.groupby('algorithmName'):
            errors.append([(alg_group['lbound'] - alg_group['survival']).values,
                           (alg_group['rbound'].values - alg_group['survival']).values])
        errors = np.abs(errors)
        print(errors)

        survival = survival_results.pivot(index='actionDuration',
                                          columns='algorithmName',
                                          values='survival')
        survival.plot(ax=ax, yerr=errors, xlim=[0, 7000], ylim=[0, 1.0],
                      capsize=4, capthick=1, ecolor='black',
                      cmap=plt.get_cmap("rainbow"), elinewidth=1)
        plt.savefig('test.png', format='png')
def main():
    """Plot each algorithm's goal-achievement time relative to the A* optimum.

    Loads ``../output/results.json``, drops failed runs, derives the optimal
    cost from the ``A_STAR`` rows (``actionDuration * pathLength``) and, for
    every domain, saves ``<domain name>.png`` showing mean ``withinOpt`` per
    action duration with confidence-interval error bars.
    """
    data = construct_data_frame(read_data("../output/results.json"))
    set_rc()
    data.drop(['commitmentType', "success", "timeLimit", "terminationType",
               'timestamp', 'octileMovement', 'lookaheadType',
               'firstIterationDuration', 'generatedNodes', 'expandedNodes',
               'domainInstanceName', 'domain_name', 'planningTime'],
              axis=1, inplace=True, errors='ignore')

    # this is a fix for the traffic domain which does not have domainSeed
    # values, so I have to fake it
    if 'domainSeed' not in data:
        data['domainSeed'] = data['domainPath']
        data['domainPath'] = 'vehicle'

    sns.set_style("white")

    # print_survival_rate(data)

    # keep only the runs that finished without an error
    data = data[data['errorMessage'].isnull()]
    data = data.sort_values(['domainPath', 'actionDuration'], ascending=True)

    # assign() builds a new frame instead of writing into a slice of `data`
    astar = data[data["algorithmName"] == "A_STAR"]
    astar = astar.assign(opt=astar["actionDuration"] * astar["pathLength"])
    astar = astar[["domainPath", "domainSeed", "opt", "actionDuration"]]
    data = pd.merge(data, astar, how='inner',
                    on=['domainPath', 'domainSeed', 'actionDuration'])
    data["withinOpt"] = data["goalAchievementTime"] / data["opt"]

    # A scalar key keeps `domain_path` a plain string on every pandas version;
    # a one-element list yields 1-tuples on pandas >= 2 and breaks re.search.
    for domain_path, domain_group in data.groupby("domainPath"):
        results = DataFrame(
            columns="actionDuration algorithmName withinOpt lbound rbound".split())
        # Last path component without its ".track" suffix. rstrip(".track")
        # would strip any trailing run of those characters, not the suffix.
        file_name = re.search("[^/]+$", domain_path).group(0)
        domain_name = re.sub(r"\.track$", "", file_name)

        for fields, action_group in domain_group.groupby(['algorithmName', 'actionDuration']):
            bound = sms.DescrStatsW(action_group["withinOpt"]).tconfint_mean()
            mean = action_group["withinOpt"].mean()
            results = add_row(
                results,
                [fields[1], fields[0], mean,
                 abs(mean - bound[0]), abs(mean - bound[1])])

        fig, ax = plt.subplots()
        errors = []
        for alg, alg_group in results.groupby('algorithmName'):
            errors.append([alg_group['lbound'].values, alg_group['rbound'].values])

        pivot = results.pivot(index='actionDuration', columns='algorithmName',
                              values='withinOpt')
        plot = pivot.plot(ax=ax, yerr=errors, capsize=4, capthick=1,
                          ecolor='black', cmap=plt.get_cmap("rainbow"),
                          elinewidth=1)
        plot.legend(title="Planners", shadow=True, frameon=True,
                    framealpha=1.0, facecolor='lightgrey')
        format_plot(plot)
        plt.savefig(domain_name + ".png", format='png')
def _create_heatmap_df(
    self,
    df: pd.DataFrame,
    lookback: int = 5,
    query: Union[str, dict] = None,
    fill_null_days: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Generates dataframes needed to plot calendar heatmap

    The method generates two dataframes where one is used to annotate the
    heatmap and the other is used to apply colors based on the sign dataframe.
    If there are multiple snapshots per day, the latest one will be selected

    Parameters
    ----------
    df : pd.DataFrame
        Model data; must contain ModelID, SnapshotTime and ResponseCount
    lookback : int
        Defines how many days to look back at data from the last snapshot
    query : Union[str, dict]
        The query to supply to _apply_query
        If a string, uses the default Pandas query function
        Else, a dict of lists where the key is column name in the dataframe
        and the corresponding value is a list of values to keep in the dataframe
    fill_null_days : bool
        If True, null values will be generated in the dataframe for
        days where there is no model snapshot

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Tuple of annotate and sign dataframes

    Raises
    ------
    ValueError
        If no rows fall inside the lookback window
    """
    df = self._apply_query(df, query)
    required_columns = {"ModelID", "SnapshotTime", "ResponseCount"}
    assert required_columns.issubset(df.columns)

    df = (
        df[["ModelID", "SnapshotTime", "ResponseCount"]]
        .sort_values("SnapshotTime")
        .reset_index(drop=True)
    )
    df["Date"] = pd.Series([i.date() for i in df["SnapshotTime"]])
    df = df[df["Date"] > (df["Date"].max() - timedelta(lookback))]
    if df.shape[0] < 1:
        raise ValueError("No data within lookback range")

    # Keep only the latest snapshot per model and day. The string "max" selects
    # the groupby reduction directly; passing the builtin is deprecated.
    latest = df.groupby(["ModelID", "Date"])["SnapshotTime"].transform("max")
    df = df[latest == df["SnapshotTime"]]

    if fill_null_days:
        idx_date = pd.date_range(df["Date"].min(), df["Date"].max())
        # NOTE(review): this chain assumes reset_index() yields an "index"
        # column holding the reindexed dates - verify against the pandas
        # version in use.
        df = (
            df.set_index("Date")
            .groupby("ModelID")
            .apply(lambda d: d.reindex(idx_date))
            # .drop("ModelID", axis=1)
            # .reset_index("ModelID")
            .reset_index()
            .rename(columns={"index": "Date"})
        )
        df["Date"] = pd.to_datetime(df["Date"]).dt.date

    df_annot = df.pivot(columns="Date", values="ResponseCount", index="ModelID")
    df_sign = self._create_sign_df(df_annot)
    return (df_annot, df_sign)
def df_to_heatmap_format(df: pd.DataFrame):
    """Reshape long age-band counts into a heatmap table.

    The result has one row per age band and one column per date, with missing
    counts filled with 0, counts cast to ``int`` and the age-band column cast
    to pandas ``"string"`` dtype.
    """
    # pivot the df
    logging.info("Pivot[ing] age bin df")
    by_date = df.pivot(index=[DATE], columns=[AGE_BAND], values=[COUNT])[COUNT]
    plot_heatmap = by_date.reset_index().fillna(0)
    logging.info("Pivot[ed] age bin df")

    logging.info("Transform[ing] df for heatmap")
    transposed = plot_heatmap.transpose().copy()
    # reset df multi index levels: the first row (the dates) becomes the header
    header_row = transposed.iloc[0]
    plot_heatmap_trans = transposed.reset_index()
    plot_heatmap_trans = plot_heatmap_trans.rename(columns=header_row)
    plot_heatmap_trans = plot_heatmap_trans.drop(0, axis=0)
    logging.info("Transform[ed] df for heatmap")

    # convert date values
    logging.info("Convert[ing] date columns")
    # every column after the first (age band) one holds counts for a date
    for date_col in plot_heatmap_trans.columns.to_list()[1:]:
        plot_heatmap_trans[date_col] = plot_heatmap_trans[date_col].astype(int)
    logging.info("Convert[ed] date columns")

    # convert age bin values to string
    logging.info("Convert[ing] age bin columns to string")
    age_band_text = plot_heatmap_trans[AGE_BAND].astype("string")
    plot_heatmap_trans[AGE_BAND] = age_band_text
    logging.info("Convert[ed] age bin columns to string")

    return plot_heatmap_trans
def pipe_age_pivot(self, df: pd.DataFrame) -> pd.DataFrame:
    """Pivot the long age dataset to one row per (location, date).

    Every remaining column is spread across ``age_group``, giving two-level
    ``(metric, age_group)`` columns that are then sorted by the concatenation
    of both levels.

    Raises:
        Exception: if a location-date-age_group combination appears more
            than once.
        ValueError: if the age groups available for the three per-hundred
            metrics do not match.
    """
    duplicates = df[df.duplicated(subset=["date", "location", "age_group"])]
    if len(duplicates) > 0:
        print(duplicates)
        raise Exception("There are duplicate combinations of location-date-age_group in the age dataset!")

    df = df.pivot(
        index=["location", "date"],
        columns="age_group",
    ).reset_index()

    # Ensure column order
    columns = pd.MultiIndex.from_tuples(sorted(df.columns, key=lambda x: x[0] + x[1]))
    df = df[columns]

    columns_wrong_1 = df.people_vaccinated_per_hundred.columns.difference(
        df.people_fully_vaccinated_per_hundred.columns
    )
    columns_wrong_2 = df.people_fully_vaccinated_per_hundred.columns.difference(
        df.people_with_booster_per_hundred.columns
    )
    # Test for "any label left", not for the truthiness of the labels themselves
    if not columns_wrong_1.empty or not columns_wrong_2.empty:
        raise ValueError(
            "There is a mismatch between age groups in people vaccinated and people fully vaccinated"
        )
    return df
def getAdjClosePrices(tickers, startdate, enddate, db_path="/Users/Felix/assetjet.db"):
    """Load adjusted close prices for ``tickers`` between two dates.

    Args:
        tickers: iterable of ticker codes to load.
        startdate: first date to include (inclusive).
        enddate: last date to include (inclusive).
        db_path: path of the SQLite database holding the ``TimeSeries`` table;
            defaults to the location that used to be hard-coded.

    Returns:
        Tuple ``(prices, seriesbegin)``: ``prices`` is a DataFrame indexed by
        ``Date`` with one column per ticker; ``seriesbegin`` is a DataFrame
        indexed by ticker whose ``Date`` column is that ticker's first date.
    """
    # Open DB Connection, TODO: switch to SQLAlchemy
    conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)
    try:
        cursor = conn.cursor()
        try:
            # Query: restrict the time series to the requested tickers via a
            # temporary table so the filter stays parameterised.
            cursor.execute("""CREATE TEMP TABLE Tickers (Cd Text)""")
            cursor.executemany("""INSERT INTO Tickers VALUES(?)""",
                               [(ticker,) for ticker in tickers])
            cursor.execute(
                """SELECT ts.Cd, Date, AdjClose
                   FROM TimeSeries ts
                   INNER JOIN Tickers t ON ts.Cd = t.Cd
                   WHERE Date >= ? AND Date <= ?""",
                (startdate, enddate),
            )
            rows = cursor.fetchall()
            # The first item of each description entry is the column name;
            # zip(...)[0] only worked on Python 2, where zip returned a list.
            column_names = [description[0] for description in cursor.description]
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    # Create a pandas DataFrame
    pricesRaw = DataFrame(rows, columns=column_names)
    pricesRaw["Date"] = pd.to_datetime(pricesRaw["Date"])  # convert date to datetime
    seriesbegin = pricesRaw[["Cd", "Date"]].groupby("Cd").min()

    # Pivot DataFrame (keyword arguments: positional use is not accepted by newer pandas)
    prices = pricesRaw.pivot(index="Date", columns="Cd", values="AdjClose")
    return prices, seriesbegin
def pipe_calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
    """Turn per-vaccine, per-dose daily columns into cumulative metrics by date.

    Input columns other than ``date`` are named ``<vaccine><dose number>``.
    Doses 1-3 are combined into ``total_vaccinations``, ``people_vaccinated``,
    ``people_fully_vaccinated`` and ``total_boosters``; which dose completes
    the schedule depends on whether the vaccine is listed in ``self._vax_2d``
    or ``self._vax_1d``. The daily sums are then accumulated over dates.
    """
    metric_cols = [
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
        "total_boosters",
    ]

    # Long format: one row per (date, vaccine+dose column)
    df = df.melt(id_vars="date", var_name="vaccine", value_name="doses")
    # Split the trailing digits (dose number) off the vaccine name
    df["dose_number"] = df.vaccine.str.extract(r"(\d+)$").astype(int)
    df["vaccine"] = df.vaccine.str.replace(r"(\d+)$", "", regex=True)

    # One row per (date, vaccine), one column per dose number
    wide = df.pivot(index=["date", "vaccine"], columns="dose_number", values="doses")
    df = wide.reset_index().fillna(0)

    two_dose = df.vaccine.isin(self._vax_2d)
    one_dose = df.vaccine.isin(self._vax_1d)

    # total_vaccinations
    df["total_vaccinations"] = df[1] + df[2] + df[3]
    # people_vaccinated
    df["people_vaccinated"] = df[1]
    # people_fully_vaccinated
    df.loc[two_dose, "people_fully_vaccinated"] = df[2]
    df.loc[one_dose, "people_fully_vaccinated"] = df[1]
    # total_boosters
    df.loc[two_dose, "total_boosters"] = df[3]
    df.loc[one_dose, "total_boosters"] = df[2] + df[3]

    # Collapse vaccines into one row per date, then accumulate over time
    daily = df[["date"] + metric_cols].groupby("date", as_index=False).sum()
    df = daily.sort_values("date")
    df[metric_cols] = df[metric_cols].cumsum().astype(int)
    return df
def save_output(df: pd.DataFrame, relevant_mac_addr_map: dict, method: str) -> None:
    """Export signal strength as CSV tables and annotated heatmaps.

    The input is pivoted to one row per ``mac_addr`` and one column per
    ``location``. Three versions are written in turn: all rows, only rows
    without missing values, and only rows whose MAC address is a key of
    ``relevant_mac_addr_map`` (renamed through that mapping and sorted).
    """

    def _export(frame, label, title):
        # Write one table plus its heatmap, then clear the figure for the next.
        frame.to_csv(CSV_OUTPUT_FOLDER / f'{label}.csv')
        sns.heatmap(frame, annot=True)
        plt.title(title)
        plt.savefig(HEATMAP_OUTPUT_FOLDER / f'{label}.png', bbox_inches="tight")
        plt.clf()

    signal = df.pivot(index="mac_addr", columns="location", values="signal_str")
    _export(signal, f"{method} - All APs", f"{method} - RSSI of ALL APs")

    # Access points seen at every location
    complete = signal.dropna(axis=0)
    _export(complete, f"{method} - Non-NaN APs", f"{method} - RSSI of all Non-NaN APs")

    # Restrict to the mapped access points and label them via the mapping
    relevant = complete[complete.index.isin(relevant_mac_addr_map.keys())]
    relevant = relevant.rename(index=relevant_mac_addr_map)
    relevant = relevant.sort_values('mac_addr')
    _export(relevant, f"{method} - Relevant APs", f"{method} - RSSI of Relevant APs")
def generate_expresson_matrix(
        table_expression: pandas.DataFrame) -> pandas.DataFrame:
    """Return a locusTag x strain matrix of ``log2FoldChange`` values."""
    return table_expression.pivot(
        index='locusTag',
        columns='strain',
        values='log2FoldChange',
    )
def results_long_to_wide(metrics: pd.DataFrame) -> pd.DataFrame:
    """Adjusts metric results from long format to wide.

    The result has one row per (exp_id, exp_variant_id) and one column per
    (metric_name, statistic). The input frame is left unmodified.
    """
    # Compute lower and upper bound for confidence interval and change
    # experiment variants to upper case. assign() returns a new frame, so the
    # caller's DataFrame does not silently gain columns.
    metrics = metrics.assign(
        conf_int_lower=metrics["diff"] - metrics["confidence_interval"],
        conf_int_upper=metrics["diff"] + metrics["confidence_interval"],
        exp_variant_id=metrics["exp_variant_id"].str.upper(),
    )

    # Reshape metrics DataFrame - from long to wide
    r = metrics.pivot(
        index=["exp_id", "exp_variant_id"],
        columns=["metric_name", "metric_id"],
        values=["mean", "diff", "conf_int_lower", "conf_int_upper", "p_value"],
    )

    # Add column multiindex names and transpose
    r.columns.names = ["statistic", "metric_name", "metric_id"]
    r = r.transpose()

    # Sort metrics and statistics in the right order
    r.reset_index(inplace=True)
    r["metric_id"] = r.apply(_enrich_metric_id, axis="columns")
    r.sort_values(by="metric_id", inplace=True)
    r.drop(columns=[("metric_id", "")], inplace=True)

    # Set index and transpose back
    r.set_index(["metric_name", "statistic"], inplace=True)
    r = r.transpose()

    return r
def make_data_container(feature_data: pd.DataFrame, cluster: pd.Series,
                        sample_metadata: pd.DataFrame,
                        fill_na: bool = True) -> DataContainer:
    """
    Organizes the detected and matched features into a DataContainer.

    Parameters
    ----------
    feature_data: DataFrame
        DataFrame obtained from detect_features function. A "cluster" column
        is added to it in place.
    cluster: pd.Series
        Series obtained from feature_correspondence function.
    sample_metadata: DataFrame
        DataFrame with information from each analyzed sample. The index must
        be the sample names used in feature_data. A column named "class", with
        the class name of each sample is required. For further data processing
        run order information in a column named "order" and analytical batch
        information in a column named "batch" are recommended.
    fill_na: bool, True
        If True fill missing values in the data matrix with zeros.

    Returns
    -------
    DataContainer
    """
    # remove noise
    feature_data["cluster"] = cluster
    not_noise = cluster != "noise"
    feature_data = feature_data[not_noise]

    # compute aggregate statistics for each feature -> feature metadata
    estimators = {"mz": ["mean", "std", "min", "max"],
                  "rt": ["mean", "std", "min", "max"]}
    feature_metadata = feature_data.groupby("cluster").agg(estimators)
    feature_metadata.columns = _flatten_column_multindex(feature_metadata)
    feature_metadata.index.name = "feature"

    # make data matrix
    data_matrix = feature_data.pivot(index="sample", columns="cluster",
                                     values="area")
    data_matrix.columns.name = "feature"
    if fill_na:
        data_matrix = data_matrix.fillna(0)

    # add samples without features as all-zero rows
    missing_index = sample_metadata.index.difference(data_matrix.index)
    # TODO: manage data inputting
    missing = pd.DataFrame(data=0, index=missing_index,
                           columns=data_matrix.columns)
    # DataFrame.append no longer exists in current pandas; concat is the
    # equivalent row-wise stack.
    data_matrix = pd.concat([data_matrix, missing])
    # order the rows like sample_metadata
    data_matrix = data_matrix.loc[sample_metadata.index, :]

    dc = DataContainer(data_matrix, feature_metadata, sample_metadata)
    return dc
def from_pico_stream(df: pd.DataFrame) -> pd.DataFrame:
    """Packs a channel-by-channel data-stream into a PicoLog PLW Player data
    dataframe, where each row has temperature measurements across all
    PicoLogger acquisition channels.

    For an input data-stream of length num_samples x num_channels, the output
    dataframe will have shape (num_samples, num_channels). The input dataframe
    is left unchanged.

    Args:
        df: PicoLog PLW Player data-stream, where each row has a temperature
            measurement from a single PicoLogger acquisition channel. Must
            have ``channel`` and ``temp`` columns.

    Returns:
        Equivalent packed-dataframe, where each row has temperature
        measurements across all PicoLogger acquisition channels.
            index: one entry per block of channels (original index floor-
                divided by the number of channels)
            columns: `<channel_name>`, ... x num_channels
    """
    num_channels = len(df['channel'].unique())

    # Reindex timestamps with one timestamp per block of channels. set_index
    # returns a new frame, so the caller's index is not overwritten.
    blocked = df.set_index(df.index // num_channels)

    # Pivot table
    return blocked.pivot(columns='channel', values='temp')
def img_seaborn(groupusersdict, labels, values1, values2, values3):
    """Draw a one-row heatmap of ``groupusersdict`` and save it as seaborn.png.

    Every label (with surrounding newlines stripped) that is a key of
    ``groupusersdict`` gets its value set to 1 in that dict, which is therefore
    modified in place. The dict's keys become the heatmap columns and its
    values the cell contents. Nothing is drawn when the filtered label list
    is empty.

    ``values1``..``values3`` are only passed through ``filterzerodata4Three``.
    """
    labels, values1, values2, values3 = filterzerodata4Three(
        labels, values1, values2, values3)
    if not labels:
        return

    for raw_label in labels:
        user = raw_label.strip('\n')
        if user in groupusersdict:
            groupusersdict[user] = 1

    names = list(groupusersdict.keys())
    commits = list(groupusersdict.values())
    # A single constant "period" value puts every user on the same row
    periods = [0] * len(names)

    fig, ax = plt.subplots(figsize=(14, 2))
    frame = DataFrame({'姓名': names, '提交': commits, '区间': periods})
    result = frame.pivot(index='区间', columns='姓名', values='提交')
    ax = sns.heatmap(result, annot=True, fmt="g", cmap="Greens")
    ax.set_title("近7天组内提交情况0代表无提交,1代表有提交,3代表特殊情况")
    # Needed so that Chinese labels render correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.savefig(get_resultpath() + '/seaborn.png')
    plt.close()
def getAdjClosePrices(tickers, startdate, enddate, db='/Users/Felix/assetjet.db'):
    """Load adjusted close prices for ``tickers`` between two dates.

    Parameters
    ----------
    tickers : iterable of str
        Ticker codes to select (matched against ``TimeSeries.Cd``).
    startdate, enddate :
        Inclusive bounds compared against ``TimeSeries.Date``.
    db : str, optional
        Path of the SQLite database file. Defaults to the previously
        hard-coded location.

    Returns
    -------
    prices : DataFrame
        Adjusted close prices indexed by date, one column per ticker.
    seriesbegin : DataFrame
        Indexed by ticker, with the earliest ``Date`` found for each one.
    """
    # TODO: switch to SQLAlchemy
    conn = sqlite3.connect(db, detect_types=sqlite3.PARSE_DECLTYPES)
    try:
        cursor = conn.cursor()
        try:
            # The requested tickers go into a temp table so the selection is
            # a plain join instead of a dynamically built IN (...) list.
            cursor.execute("""CREATE TEMP TABLE Tickers (Cd Text)""")
            cursor.executemany("""INSERT INTO Tickers VALUES(?)""",
                               [(ticker,) for ticker in tickers])
            cursor.execute(
                """SELECT ts.Cd, Date, AdjClose
                   FROM TimeSeries ts
                   INNER JOIN Tickers t ON ts.Cd = t.Cd
                   WHERE Date >= ? AND Date <= ?""",
                (startdate, enddate))
            rows = cursor.fetchall()
            # First element of each description tuple is the column name
            column_names = [column[0] for column in cursor.description]
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when a query fails
        conn.close()

    # Create a pandas DataFrame
    pricesRaw = DataFrame(rows, columns=column_names)
    pricesRaw['Date'] = pd.to_datetime(pricesRaw['Date'])  # convert date to datetime
    seriesbegin = pricesRaw[['Cd', 'Date']].groupby('Cd').min()

    # Pivot DataFrame (pivot arguments are keyword-only since pandas 2.0)
    prices = pricesRaw.pivot(index='Date', columns='Cd', values='AdjClose')
    return prices, seriesbegin
def pipe_pivot(self, df: pd.DataFrame) -> pd.DataFrame:
    """Pivot ``df`` to wide format when pivoting is configured.

    If either ``self.pivot_column`` or ``self.pivot_values`` is ``None`` the
    frame is returned unchanged. Otherwise rows are keyed by
    ``(self.location, self.date)``, the distinct values of
    ``self.pivot_column`` become columns, and the index is reset so the key
    columns are ordinary columns again.
    """
    if self.pivot_column is None or self.pivot_values is None:
        return df

    wide = df.pivot(
        index=[self.location, self.date],
        columns=self.pivot_column,
        values=self.pivot_values,
    )
    return wide.reset_index()
def plot_gat(data, plot_title, file_name):
    """Plot mean ``withinOpt`` per algorithm against action duration.

    The data is grouped by ``(algorithmName, actionDuration)``; for each group
    the mean of ``withinOpt`` and a z confidence interval of that mean are
    computed, and the means are drawn as one line per algorithm with the
    interval half-widths as error bars. The figure is written to
    ``../results/plots/<file_name>.pdf``.

    Args:
        data: DataFrame with ``algorithmLabel``, ``actionDuration`` and
            ``withinOpt`` columns. It is copied first, so the caller's frame
            is not modified.
        plot_title: Title of the plot.
        file_name: Output file name without the ``.pdf`` extension.
    """
    print(f'Data to plot: {data}')

    # Work on a copy: rescaling the caller's frame in place would rescale it
    # again on every further call with the same data.
    data = data.copy()
    # Item assignment creates the column when it does not exist yet;
    # attribute assignment would only set a Python attribute in that case.
    data['algorithmName'] = data['algorithmLabel']

    results = DataFrame(
        columns="actionDuration withinOpt algorithmName lbound rbound".split())

    # rescale action durations to ms (divides by 1e6; presumably the input is
    # in nanoseconds - confirm against the data producer)
    data['actionDuration'] = data['actionDuration'] / 1000000

    # Change data structure such that goal achievement time is averaged,
    # grouped by action duration and algorithm
    for fields, duration_group in data.groupby(
            ['algorithmName', 'actionDuration']):
        alg_name = fields[0]
        if alg_name in alg_map:
            alg_name = alg_map[alg_name]

        # Get mean of within optimal calculation, add row to results dataframe
        mean_within_opt = duration_group['withinOpt'].mean()
        within_opt_list = list(duration_group['withinOpt'])
        bound = sms.DescrStatsW(within_opt_list).zconfint_mean()
        results = add_row(results,
                          [fields[1], mean_within_opt, alg_name,
                           abs(mean_within_opt - bound[0]),
                           abs(mean_within_opt - bound[1])])

    # One [lower, upper] pair of error-bar arrays per algorithm
    errors = [
        [alg_group['lbound'].values, alg_group['rbound'].values]
        for _, alg_group in results.groupby('algorithmName')
    ]

    pivot = results.pivot(index="actionDuration", columns="algorithmName",
                          values="withinOpt")
    pivot = pivot[~pivot.index.duplicated(keep='first')]

    palette = sns.color_palette(n_colors=10)
    plot = pivot.plot(color=palette, title=plot_title, legend=True,
                      yerr=errors, ecolor='black', elinewidth=1,
                      capsize=4, capthick=1)

    # Optional axis tuning, kept for reference:
    # plot.set_xscale('log')
    # plot.set_yscale('log')
    # plot.set_xticks([50, 100, 150, 250, 500, 1000, 2000, 3200])
    # plot.set_yticks([1, 1.1, 1.5, 2])
    # plot.set_ylim([1, 1.4])

    plot.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
    plot.get_yaxis().set_major_formatter(mpl.ticker.ScalarFormatter())

    plot.set_xlabel('Planning Time per Iteration (milliseconds)')
    plot.set_ylabel('Goal Achievement Time (Factor of Optimal)')
    plot.legend(title="")

    # The context manager closes the PDF even if savefig raises
    with PdfPages("../results/plots/" + file_name + ".pdf") as pdf:
        plt.savefig(pdf, format='pdf')
def play_ai(self, rounds=100, print_stats=False):
    """Let the two players play ``rounds`` games against each other.

    Player 1 and player 2 alternate moves until ``self.winner()`` returns a
    non-None result. That result is recorded, rewards are given and both
    players and the board are reset. After all rounds both players save their
    state values. With ``print_stats`` the normalized result counts are
    printed and the cumulative count of each result is plotted.
    """
    winning_history = []

    def take_turn(player):
        """Play one move for ``player``; return True if the game ended."""
        positions = self.potential_moves()
        action = player.pick_next_move(positions, self.board, self.player_mark)
        self.make_move(action)
        board_hash = self.encode_to_key()
        player.append_board_state(board_hash)

        win = self.winner()
        if win is None:
            return False

        # Game over (a win or a draw): record it and reset everything
        winning_history.append(win)
        self.giveReward()
        self.player1.reset_player()
        self.player2.reset_player()
        self.reset_board()
        return True

    for round_no in range(rounds):
        if round_no % 100 == 0:
            print("Rounds {}".format(round_no))
        while not self.gameover:
            if take_turn(self.player1):
                break
            if take_turn(self.player2):
                break

    self.player1.save_state_values()
    self.player2.save_state_values()

    if not print_stats:
        return

    winning_series = Series(winning_history)
    print(winning_series.value_counts(normalize=True, ascending=False))
    winning_df = DataFrame({
        'wins': winning_series,
        'values': np.ones(len(winning_series)),
    })
    # One indicator column per result; cumsum turns it into a running total
    indicator = winning_df.pivot(columns='wins', values='values')
    df_to_plot = indicator.fillna(0).cumsum()
    for result in df_to_plot.columns.values:
        plt.plot(df_to_plot[result], label=f'Winner:{result}')
    plt.legend()
    plt.show()
def tiempos_de_respuesta(datos):
    """Plot ``tiempo_respuesta`` against ``numero_request``, one line per ``pid``.

    ``datos`` must be row data with four fields per row, in the order
    pid, numero_request, tiempo_respuesta, status_code. The figure is saved
    as ``tiemposDeRespuesta.png`` in the current working directory.
    """
    columnas = ['pid', 'numero_request', 'tiempo_respuesta', 'status_code']
    tabla = DataFrame(datos, columns=columnas)
    por_pid = tabla.pivot(index='numero_request',
                          columns='pid',
                          values='tiempo_respuesta')
    por_pid.plot()
    plt.savefig("tiemposDeRespuesta.png")
def test_pivot_with_tz(self): # GH 5878 df = DataFrame( { "dt1": [ datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0), datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0), ], "dt2": [ datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 2, 9, 0), datetime(2014, 1, 2, 9, 0), ], "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.DatetimeIndex(["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo") exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], index=pd.DatetimeIndex(["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific"), columns=exp_col, ) pv = df.pivot(index="dt1", columns="dt2") tm.assert_frame_equal(pv, expected) expected = DataFrame( [[0, 2], [1, 3]], index=pd.DatetimeIndex(["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific"), columns=pd.DatetimeIndex(["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo"), ) pv = df.pivot(index="dt1", columns="dt2", values="data1") tm.assert_frame_equal(pv, expected)
def test_pivot_with_tz(self): # GH 5878 df = DataFrame({'dt1': [datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0), datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0)], 'dt2': [datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 2, 9, 0), datetime(2014, 1, 2, 9, 0)], 'data1': np.arange(4, dtype='int64'), 'data2': np.arange(4, dtype='int64')}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2, name='dt2', tz='Asia/Tokyo') exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], name='dt1', tz='US/Pacific'), columns=exp_col) pv = df.pivot(index='dt1', columns='dt2') tm.assert_frame_equal(pv, expected) expected = DataFrame([[0, 2], [1, 3]], index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], name='dt1', tz='US/Pacific'), columns=pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], name='dt2', tz='Asia/Tokyo')) pv = df.pivot(index='dt1', columns='dt2', values='data1') tm.assert_frame_equal(pv, expected)
def test_pivot_periods(self): df = DataFrame( { "p1": [ pd.Period("2013-01-01", "D"), pd.Period("2013-01-02", "D"), pd.Period("2013-01-01", "D"), pd.Period("2013-01-02", "D"), ], "p2": [ pd.Period("2013-01", "M"), pd.Period("2013-01", "M"), pd.Period("2013-02", "M"), pd.Period("2013-02", "M"), ], "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M") exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), columns=exp_col, ) pv = df.pivot(index="p1", columns="p2") tm.assert_frame_equal(pv, expected) expected = DataFrame( [[0, 2], [1, 3]], index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"), ) pv = df.pivot(index="p1", columns="p2", values="data1") tm.assert_frame_equal(pv, expected)
def GetPricngData():
    """Return average prices per category and store as a JSON string.

    Runs a fixed query against the ``pricing`` table, pivots the result so
    that rows are top-level categories and columns are stores, and replaces
    missing combinations with 0.

    Returns:
        A JSON object with the keys ``index`` (categories), ``columns``
        (stores) and ``values`` (row-wise price matrix). If anything fails,
        the text of the exception is returned instead, as before.
    """
    query = (
        "SELECT DISTINCT a.Store AS Store,a.`Top Level Category`,"
        "ROUND(AVG(a.Price),2) as CompetitorsPrice, "
        "ROUND(b.MyStorePrice,2) as MyStorePrice "
        "FROM pricing a JOIN ("
        "SELECT DISTINCT `Top Level Category` AS category,AVG(Price) AS MyStorePrice "
        "FROM pricing WHERE Store = 'My Store' "
        "GROUP BY Store,`Top Level Category`) b "
        "ON a.`Top Level Category` = b.category "
        "GROUP BY a.Store,a.`Top Level Category`"
    )
    try:
        result = list(ExecuteQuery(query))
        df = DataFrame(result)
        pi = df.pivot(index="Top Level Category", columns="Store",
                      values="CompetitorsPrice")
        # fillna also works for object-dtype columns, where np.isnan raises
        pi = pi.fillna(0)
        pricingResult = {
            "index": pi.index.tolist(),
            "columns": pi.columns.tolist(),
            "values": pi.values.tolist(),
        }
        return json.dumps(pricingResult)
    except Exception as e:
        # Exceptions have no ``.message`` attribute on Python 3
        return str(e)
def test_pivot_empty(self): df = DataFrame({}, columns=["a", "b", "c"]) result = df.pivot("a", "b", "c") expected = DataFrame({}) assert_frame_equal(result, expected, check_names=False)
,['mercredi','temperature',28] ,['mercredi','ensoleillement',4] ,['mercredi','pollution',5] ,['mercredi','pluie',100] ,['jeudi','temperature',28] ,['jeudi','ensoleillement',4] ,['jeudi','pollution',5] ,['jeudi','pluie',100] ,['vendredi','temperature',28] ,['vendredi','ensoleillement',4] ,['vendredi','pollution',5] ,['vendredi','pluie',100] ] cities_data = DataFrame(releves, columns=['jour','attribute','value']) cities_data.pivot('jour','attribute','value') aliments = pd.read_csv('aliments.csv', sep='\t') aliments_with_traces = aliments.ix[aliments.traces.dropna().index] traces_iter = (set(x.split(',')) for x in aliments_with_traces['traces']) traces = set.union(*traces_iter) dummies = DataFrame(np.zeros((len(aliments_with_traces), len(traces))), columns=traces) for i, tr in enumerate(aliments_with_traces.traces): dummies.ix[i, tr.split(',')] = 1 pd.value_counts(pd.qcut(aliments[u'energy_100g'].dropna(),5)) pd.value_counts(pd.cut(aliments[u'energy_100g'].dropna(),5))
mergeM = pd.merge(sales, bonus, on='ID')  # A many-to-many Join
stack = pd.concat([employee, sales], ignore_index=True)  # Vertical Stacking

#############################################################################################################
# 8. Reshaping & Pivoting
#############################################################################################################
df1 = DataFrame([['Big', 'LAX', 3, np.nan],
                 ['Big', 'SFO', 6, 7],
                 ['Med', 'SEA-TAC', 9, np.nan],
                 ['Small', 'POR', np.nan, np.nan]],
                index=pd.Index(['LA', 'SF', 'SEA', 'POR']),
                columns=pd.Index(['Type', 'Airport', 'Cool Factor', 'D']))

# .stack() / .unstack() move the column labels into a hierarchical row index
df2 = df1.stack(dropna=False)  # column labels become the inner (child) index level
df3 = df1.unstack()            # column labels become the outer (parent) index level

# .pivot(index=, columns=, values=) reshapes long data to wide, like dplyr in R
# (the arguments are keyword-only since pandas 2.0)
df4 = df1.pivot(index='Airport', columns='Type', values='Cool Factor')

#############################################################################################################
# 9. Outlier Analysis
#############################################################################################################
np.random.seed(12345)
df = DataFrame(np.random.randn(1000, 4))
df.describe()

# assume outliers are in the -+3 region
df[0][np.abs(df[0]) > 3]              # values of column 0 with magnitude above 3
df[(np.abs(df) > 3).any(axis=1)]      # rows with at least one value of magnitude above 3
df[np.abs(df) > 3] = np.sign(df) * 3  # cap every such value at -3 / +3, keeping its sign

#############################################################################################################
# 10. Binning Data
#############################################################################################################
def test_pivot_duplicates(self):
    """pivot raises ValueError when an (index, column) pair occurs twice."""
    # ('foo', 'one') appears twice, so the reshape is ambiguous
    data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
                      'b': ['one', 'two', 'one', 'one', 'two'],
                      'c': [1., 2., 3., 3., 4.]})
    with pytest.raises(ValueError, match='duplicate entries'):
        # keyword arguments: positional use raises TypeError on pandas >= 2.0,
        # which would mask the error under test
        data.pivot(index='a', columns='b', values='c')
# Per-column fill values: NaNs in "one" become -1, NaNs in "two" become -2
replacements = {"one": -1, "two": -2}
df.fillna(value=replacements)

# Sorting (DataFrame.sort was removed from pandas; sort_values replaces it)
df = DataFrame(array([[1, 3], [1, 2], [3, 2], [2, 1]]), columns=["one", "two"])
df.sort_values(by="one")
df.sort_values(by=["one", "two"])
df.sort_values(by=["one", "two"], ascending=[False, True])

# Long -> wide: one row per date, one price column per ticker
prices = [101.0, 102.0, 103.0]
tickers = ["GOOG", "AAPL"]
data = list(itertools.product(tickers, prices))
dates = pandas.date_range("2013-01-03", periods=3)
df = DataFrame(data, columns=["ticker", "price"])
df["dates"] = dates.append(dates)
df
df.pivot(index="dates", columns="ticker", values="price")

# Reindexing (reindex_axis was removed; reindex(columns=...) replaces it)
original = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=["a", "b", "c"], columns=["one", "two"])
original.reindex(index=["b", "c", "d"])
different = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=["c", "d", "e"], columns=["one", "two"])
original.reindex_like(different)
original.reindex(columns=["two", "one"])

# Merging on a shared key column
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=["one", "three"])
left.merge(right, on="one")  # Same as how='inner'
left.merge(right, on="one", how="left")
left.merge(right, on="one", how="right")
left.merge(right, on="one", how="outer")
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
def test_pivot_integer_bug(self): df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) result = df.pivot(index=1, columns=0, values=2) repr(result) tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
def test_pivot_empty(self): df = DataFrame({}, columns=['a', 'b', 'c']) result = df.pivot('a', 'b', 'c') expected = DataFrame({}) tm.assert_frame_equal(result, expected, check_names=False)
def test_pivot_duplicates(self): data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'], 'b': ['one', 'two', 'one', 'one', 'two'], 'c': [1., 2., 3., 3., 4.]}) with tm.assert_raises_regex(ValueError, 'duplicate entries'): data.pivot('a', 'b', 'c')
def test_pivot_integer_bug(self): df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) result = df.pivot(index=1, columns=0, values=2) repr(result) self.assert_numpy_array_equal(result.columns, ['A', 'B'])
# Fill NaNs column by column: -1 for 'one', -2 for 'two'
replacements = {'one': -1, 'two': -2}
df.fillna(value=replacements)

# Sort by one or several columns; DataFrame.sort no longer exists in pandas,
# sort_values(by=...) is its replacement
df = DataFrame(array([[1, 3], [1, 2], [3, 2], [2, 1]]), columns=['one', 'two'])
df.sort_values(by='one')
df.sort_values(by=['one', 'two'])
df.sort_values(by=['one', 'two'], ascending=[False, True])

# Build a long (ticker, price, date) table and pivot it to one column per ticker
prices = [101.0, 102.0, 103.0]
tickers = ['GOOG', 'AAPL']
data = list(itertools.product(tickers, prices))
dates = pandas.date_range('2013-01-03', periods=3)
df = DataFrame(data, columns=['ticker', 'price'])
df['dates'] = dates.append(dates)
df
df.pivot(index='dates', columns='ticker', values='price')

# Reindex rows, align to another frame, and reorder columns
# (reindex_axis no longer exists; use reindex(columns=...))
original = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=['a', 'b', 'c'],
                     columns=['one', 'two'])
original.reindex(index=['b', 'c', 'd'])
different = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=['c', 'd', 'e'],
                      columns=['one', 'two'])
original.reindex_like(different)
original.reindex(columns=['two', 'one'])

# Join two frames on the shared 'one' column with each join type
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=['one', 'three'])
left.merge(right, on='one')  # Same as how='inner'
left.merge(right, on='one', how='left')
left.merge(right, on='one', how='right')
left.merge(right, on='one', how='outer')
left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
# Concatenate under the outer keys 'one' / 'two' -> two-level index
data2 = pd.concat([s1, s2], keys=['one', 'two'])
# print(data2)
# print(data2.unstack())
# print(data2.unstack().stack())
# print(data2.unstack().stack(dropna=False))

df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
# print(df)
# print(df.unstack('state'))
# print(df.unstack('state').stack('side'))

xls_file = pd.ExcelFile('C:\\Users\\Administrator\\Desktop\\data.xlsx')
ldata = DataFrame(xls_file.parse('Sheet1'))

# Long -> wide: one row per date, one column per item
# (pivot arguments are keyword-only since pandas 2.0)
pivoted = ldata.pivot(index='date', columns='item', values='value')
# print(ldata[:10])
# print(pivoted.head())

ldata["value2"] = np.random.randn(len(ldata))
# print(ldata[:10])

# Without ``values`` every remaining column is pivoted, which gives
# hierarchical columns
pivoted = ldata.pivot(index='date', columns='item')
# print(pivoted[:5])
# print(pivoted['value'][:3])

# The same reshape spelled with set_index + unstack
unstacked = ldata.set_index(['date', 'item']).unstack('item')
# print(unstacked[:7])

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
# print(data)