def test_drop_names(self):
    df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                   index=['a', 'b', 'c'],
                   columns=['d', 'e', 'f'])
    df.index.name, df.columns.name = 'first', 'second'
    df_dropped_b = df.drop('b')
    df_dropped_e = df.drop('e', axis=1)
    df_inplace_b, df_inplace_e = df.copy(), df.copy()
    df_inplace_b.drop('b', inplace=True)
    df_inplace_e.drop('e', axis=1, inplace=True)
    for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
        self.assertEqual(obj.index.name, 'first')
        self.assertEqual(obj.columns.name, 'second')
    self.assertEqual(list(df.columns), ['d', 'e', 'f'])

    self.assertRaises(ValueError, df.drop, ['g'])
    self.assertRaises(ValueError, df.drop, ['g'], 1)

    # errors = 'ignore'
    dropped = df.drop(['g'], errors='ignore')
    expected = Index(['a', 'b', 'c'], name='first')
    self.assert_index_equal(dropped.index, expected)

    dropped = df.drop(['b', 'g'], errors='ignore')
    expected = Index(['a', 'c'], name='first')
    self.assert_index_equal(dropped.index, expected)

    dropped = df.drop(['g'], axis=1, errors='ignore')
    expected = Index(['d', 'e', 'f'], name='second')
    self.assert_index_equal(dropped.columns, expected)

    dropped = df.drop(['d', 'g'], axis=1, errors='ignore')
    expected = Index(['e', 'f'], name='second')
    self.assert_index_equal(dropped.columns, expected)
def test_append_empty_dataframe(self):
    # Empty df append empty df
    df1 = DataFrame([])
    df2 = DataFrame([])
    result = df1.append(df2)
    expected = df1.copy()
    assert_frame_equal(result, expected)

    # Non-empty df append empty df
    df1 = DataFrame(np.random.randn(5, 2))
    df2 = DataFrame()
    result = df1.append(df2)
    expected = df1.copy()
    assert_frame_equal(result, expected)

    # Empty df with columns append empty df
    df1 = DataFrame(columns=['bar', 'foo'])
    df2 = DataFrame()
    result = df1.append(df2)
    expected = df1.copy()
    assert_frame_equal(result, expected)

    # Non-Empty df with columns append empty df
    df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
    df2 = DataFrame()
    result = df1.append(df2)
    expected = df1.copy()
    assert_frame_equal(result, expected)
def test_assign(self):
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    original = df.copy()
    result = df.assign(C=df.B / df.A)
    expected = df.copy()
    expected['C'] = [4, 2.5, 2]
    assert_frame_equal(result, expected)

    # lambda syntax
    result = df.assign(C=lambda x: x.B / x.A)
    assert_frame_equal(result, expected)

    # original is unmodified
    assert_frame_equal(df, original)

    # Non-Series array-like
    result = df.assign(C=[4, 2.5, 2])
    assert_frame_equal(result, expected)
    # original is unmodified
    assert_frame_equal(df, original)

    result = df.assign(B=df.B / df.A)
    expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
    assert_frame_equal(result, expected)

    # overwrite
    result = df.assign(A=df.A + df.B)
    expected = df.copy()
    expected['A'] = [5, 7, 9]
    assert_frame_equal(result, expected)

    # lambda
    result = df.assign(A=lambda x: x.A + x.B)
    assert_frame_equal(result, expected)
class Equals(object):

    def setup(self):
        N = 10**3
        self.float_df = DataFrame(np.random.randn(N, N))
        self.float_df_nan = self.float_df.copy()
        self.float_df_nan.iloc[-1, -1] = np.nan

        self.object_df = DataFrame('foo', index=range(N), columns=range(N))
        self.object_df_nan = self.object_df.copy()
        self.object_df_nan.iloc[-1, -1] = np.nan

        self.nonunique_cols = self.object_df.copy()
        self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns)
        self.nonunique_cols_nan = self.nonunique_cols.copy()
        self.nonunique_cols_nan.iloc[-1, -1] = np.nan

    def time_frame_float_equal(self):
        self.float_df.equals(self.float_df)

    def time_frame_float_unequal(self):
        self.float_df.equals(self.float_df_nan)

    def time_frame_nonunique_equal(self):
        self.nonunique_cols.equals(self.nonunique_cols)

    def time_frame_nonunique_unequal(self):
        self.nonunique_cols.equals(self.nonunique_cols_nan)

    def time_frame_object_equal(self):
        self.object_df.equals(self.object_df)

    def time_frame_object_unequal(self):
        self.object_df.equals(self.object_df_nan)
def test_stack_mixed_levels(self):
    columns = MultiIndex.from_tuples(
        [("A", "cat", "long"), ("B", "cat", "long"),
         ("A", "dog", "short"), ("B", "dog", "short")],
        names=["exp", "animal", "hair_length"],
    )
    df = DataFrame(randn(4, 4), columns=columns)

    animal_hair_stacked = df.stack(level=["animal", "hair_length"])
    exp_hair_stacked = df.stack(level=["exp", "hair_length"])

    # GH #8584: Need to check that stacking works when a number
    # is passed that is both a level name and in the range of
    # the level numbers
    df2 = df.copy()
    df2.columns.names = ["exp", "animal", 1]
    assert_frame_equal(df2.stack(level=["animal", 1]),
                       animal_hair_stacked, check_names=False)
    assert_frame_equal(df2.stack(level=["exp", 1]),
                       exp_hair_stacked, check_names=False)

    # When mixed types are passed and the ints are not level
    # names, raise
    self.assertRaises(ValueError, df2.stack, level=["animal", 0])

    # GH #8584: Having 0 in the level names could raise a
    # strange error about lexsort depth
    df3 = df.copy()
    df3.columns.names = ["exp", "animal", 0]
    assert_frame_equal(df3.stack(level=["animal", 0]),
                       animal_hair_stacked, check_names=False)
def test_dropna(self):
    df = DataFrame(np.random.randn(6, 4))
    df[2][:2] = np.nan

    dropped = df.dropna(axis=1)
    expected = df.loc[:, [0, 1, 3]]
    inp = df.copy()
    inp.dropna(axis=1, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    dropped = df.dropna(axis=0)
    expected = df.loc[lrange(2, 6)]
    inp = df.copy()
    inp.dropna(axis=0, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    # threshold
    dropped = df.dropna(axis=1, thresh=5)
    expected = df.loc[:, [0, 1, 3]]
    inp = df.copy()
    inp.dropna(axis=1, thresh=5, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    dropped = df.dropna(axis=0, thresh=4)
    expected = df.loc[lrange(2, 6)]
    inp = df.copy()
    inp.dropna(axis=0, thresh=4, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    dropped = df.dropna(axis=1, thresh=4)
    assert_frame_equal(dropped, df)

    dropped = df.dropna(axis=1, thresh=3)
    assert_frame_equal(dropped, df)

    # subset
    dropped = df.dropna(axis=0, subset=[0, 1, 3])
    inp = df.copy()
    inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
    assert_frame_equal(dropped, df)
    assert_frame_equal(inp, df)

    # all
    dropped = df.dropna(axis=1, how='all')
    assert_frame_equal(dropped, df)

    df[2] = np.nan
    dropped = df.dropna(axis=1, how='all')
    expected = df.loc[:, [0, 1, 3]]
    assert_frame_equal(dropped, expected)

    # bad input
    msg = ("No axis named 3 for object type"
           " <class 'pandas.core.frame.DataFrame'>")
    with pytest.raises(ValueError, match=msg):
        df.dropna(axis=3)
def test_stack_mixed_levels(self):
    columns = MultiIndex.from_tuples(
        [('A', 'cat', 'long'), ('B', 'cat', 'long'),
         ('A', 'dog', 'short'), ('B', 'dog', 'short')],
        names=['exp', 'animal', 'hair_length']
    )
    df = DataFrame(randn(4, 4), columns=columns)

    animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
    exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

    # GH #8584: Need to check that stacking works when a number
    # is passed that is both a level name and in the range of
    # the level numbers
    df2 = df.copy()
    df2.columns.names = ['exp', 'animal', 1]
    assert_frame_equal(df2.stack(level=['animal', 1]),
                       animal_hair_stacked, check_names=False)
    assert_frame_equal(df2.stack(level=['exp', 1]),
                       exp_hair_stacked, check_names=False)

    # When mixed types are passed and the ints are not level
    # names, raise
    pytest.raises(ValueError, df2.stack, level=['animal', 0])

    # GH #8584: Having 0 in the level names could raise a
    # strange error about lexsort depth
    df3 = df.copy()
    df3.columns.names = ['exp', 'animal', 0]
    assert_frame_equal(df3.stack(level=['animal', 0]),
                       animal_hair_stacked, check_names=False)
def test_stack_int_level_names(self):
    columns = MultiIndex.from_tuples(
        [('A', 'cat', 'long'), ('B', 'cat', 'long'),
         ('A', 'dog', 'short'), ('B', 'dog', 'short')],
        names=['exp', 'animal', 'hair_length']
    )
    df = DataFrame(randn(4, 4), columns=columns)

    exp_animal_stacked = df.stack(level=['exp', 'animal'])
    animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
    exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

    df2 = df.copy()
    df2.columns.names = [0, 1, 2]
    assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
                       check_names=False)
    assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
                       check_names=False)
    assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
                       check_names=False)

    # Out-of-order int column names
    df3 = df.copy()
    df3.columns.names = [2, 0, 1]
    assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
                       check_names=False)
    assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
                       check_names=False)
    assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
                       check_names=False)
def test_interp_alt_scipy(self):
    tm._skip_if_no_scipy()
    df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                    'C': [1, 2, 3, 5, 8, 13, 21]})
    result = df.interpolate(method='barycentric')
    expected = df.copy()
    expected.loc[2, 'A'] = 3
    expected.loc[5, 'A'] = 6
    assert_frame_equal(result, expected)

    result = df.interpolate(method='barycentric', downcast='infer')
    assert_frame_equal(result, expected.astype(np.int64))

    result = df.interpolate(method='krogh')
    expectedk = df.copy()
    expectedk['A'] = expected['A']
    assert_frame_equal(result, expectedk)

    _skip_if_no_pchip()
    import scipy
    result = df.interpolate(method='pchip')
    expected.loc[2, 'A'] = 3

    if LooseVersion(scipy.__version__) >= '0.17.0':
        expected.loc[5, 'A'] = 6.0
    else:
        expected.loc[5, 'A'] = 6.125

    assert_frame_equal(result, expected)
def test_interp_alt_scipy(self):
    tm._skip_if_no_scipy()
    df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                    'C': [1, 2, 3, 5, 8, 13, 21]})
    result = df.interpolate(method='barycentric')
    expected = df.copy()
    expected['A'].iloc[2] = 3
    expected['A'].iloc[5] = 6
    assert_frame_equal(result, expected)

    result = df.interpolate(method='barycentric', downcast='infer')
    assert_frame_equal(result, expected.astype(np.int64))

    result = df.interpolate(method='krogh')
    expectedk = df.copy()
    # expectedk['A'].iloc[2] = 3
    # expectedk['A'].iloc[5] = 6
    expectedk['A'] = expected['A']
    assert_frame_equal(result, expectedk)

    _skip_if_no_pchip()
    result = df.interpolate(method='pchip')
    expected['A'].iloc[2] = 3
    expected['A'].iloc[5] = 6.125
    assert_frame_equal(result, expected)
class Append(object):

    goal_time = 0.2

    def setup(self):
        self.df1 = DataFrame(np.random.randn(10000, 4),
                             columns=['A', 'B', 'C', 'D'])
        self.df2 = self.df1.copy()
        self.df2.index = np.arange(10000, 20000)
        self.mdf1 = self.df1.copy()
        self.mdf1['obj1'] = 'bar'
        self.mdf1['obj2'] = 'bar'
        self.mdf1['int1'] = 5
        try:
            with warnings.catch_warnings(record=True):
                self.mdf1.consolidate(inplace=True)
        except Exception:
            pass
        self.mdf2 = self.mdf1.copy()
        self.mdf2.index = self.df2.index

    def time_append_homogenous(self):
        self.df1.append(self.df2)

    def time_append_mixed(self):
        self.mdf1.append(self.mdf2)
def test_frame_to_period(self):
    K = 5
    from pandas.tseries.period import period_range

    dr = date_range('1/1/2000', '1/1/2001')
    pr = period_range('1/1/2000', '1/1/2001')
    df = DataFrame(randn(len(dr), K), index=dr)
    df['mix'] = 'a'

    pts = df.to_period()
    exp = df.copy()
    exp.index = pr
    assert_frame_equal(pts, exp)

    pts = df.to_period('M')
    tm.assert_index_equal(pts.index, exp.index.asfreq('M'))

    df = df.T
    pts = df.to_period(axis=1)
    exp = df.copy()
    exp.columns = pr
    assert_frame_equal(pts, exp)

    pts = df.to_period('M', axis=1)
    tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))

    self.assertRaises(ValueError, df.to_period, axis=2)
def test_to_period(self):
    from pandas.tseries.period import period_range

    ts = _simple_ts('1/1/2000', '1/1/2001')

    pts = ts.to_period()
    exp = ts.copy()
    exp.index = period_range('1/1/2000', '1/1/2001')
    assert_series_equal(pts, exp)

    pts = ts.to_period('M')
    exp.index = exp.index.asfreq('M')
    tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
    assert_series_equal(pts, exp)

    # GH 7606 without freq
    idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                         '2011-01-04'])
    exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                              '2011-01-04'], freq='D')

    s = Series(np.random.randn(4), index=idx)
    expected = s.copy()
    expected.index = exp_idx
    assert_series_equal(s.to_period(), expected)

    df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx)
    expected = df.copy()
    expected.index = exp_idx
    assert_frame_equal(df.to_period(), expected)

    expected = df.copy()
    expected.columns = exp_idx
    assert_frame_equal(df.to_period(axis=1), expected)
def test_copy(self):
    """Check inplace/copy behavior of link_df, link_df_iter"""
    # One 1D stepper
    N = 5
    f = DataFrame({'x': np.arange(N), 'y': np.ones(N),
                   'frame': np.arange(N)})
    f_inplace = f.copy()
    expected = f.copy()
    expected['particle'] = np.zeros(N)

    # Should add particle column in-place
    # UNLESS diagnostics are enabled (or input dataframe is not writeable)
    actual = self.link_df(f_inplace, 5)
    assert_frame_equal(actual, expected)
    if self.do_diagnostics:
        assert 'particle' not in f_inplace.columns
    else:
        assert_frame_equal(actual, f_inplace)

    # When DataFrame is actually a view, link_df should produce a warning
    # and then copy the DataFrame. This only happens for pandas >= 0.16.
    if is_pandas_recent:
        with assert_produces_warning(UserWarning):
            actual = self.link_df(f[f['frame'] > 0], 5)
        assert 'particle' not in f.columns

    # Should copy
    actual = self.link_df(f, 5, copy_features=True)
    assert_frame_equal(actual, expected)
    assert 'particle' not in f.columns

    # Should copy
    actual_iter = self.link_df_iter(f, 5, hash_size=(10, 2))
    assert_frame_equal(actual_iter, expected)
    assert 'particle' not in f.columns
def test_copy(self):
    """Check inplace/copy behavior of link_df, link_df_iter"""
    # One 1D stepper
    N = 5
    f = DataFrame({'x': np.arange(N), 'y': np.ones(N),
                   'frame': np.arange(N)})
    f_inplace = f.copy()
    expected = f.copy()
    expected['particle'] = np.zeros(N)

    # Should add particle column in-place
    # UNLESS diagnostics are enabled (or input dataframe is not writeable)
    actual = self.link_df(f_inplace, 5)
    assert_traj_equal(actual, expected)
    if self.do_diagnostics:
        assert 'particle' not in f_inplace.columns
    else:
        assert_traj_equal(actual, f_inplace)

    # Should copy
    actual = self.link_df(f, 5, copy_features=True)
    assert_traj_equal(actual, expected)
    assert 'particle' not in f.columns

    # Should copy
    actual_iter = self.link_df_iter(f, 5, hash_size=(10, 2))
    assert_traj_equal(actual_iter, expected)
    assert 'particle' not in f.columns
def test_frame_to_period(self):
    K = 5

    dr = date_range('1/1/2000', '1/1/2001')
    pr = period_range('1/1/2000', '1/1/2001')
    df = DataFrame(np.random.randn(len(dr), K), index=dr)
    df['mix'] = 'a'

    pts = df.to_period()
    exp = df.copy()
    exp.index = pr
    assert_frame_equal(pts, exp)

    pts = df.to_period('M')
    tm.assert_index_equal(pts.index, exp.index.asfreq('M'))

    df = df.T
    pts = df.to_period(axis=1)
    exp = df.copy()
    exp.columns = pr
    assert_frame_equal(pts, exp)

    pts = df.to_period('M', axis=1)
    tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))

    msg = ("No axis named 2 for object type"
           " <class 'pandas.core.frame.DataFrame'>")
    with pytest.raises(ValueError, match=msg):
        df.to_period(axis=2)
def test_setitem_with_datetime_tz(self):
    # 16889
    # support .loc with alignment and tz-aware DatetimeIndex
    mask = np.array([True, False, True, False])

    idx = date_range('20010101', periods=4, tz='UTC')
    df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64')

    result = df.copy()
    result.loc[mask, :] = df.loc[mask, :]
    tm.assert_frame_equal(result, df)

    result = df.copy()
    result.loc[mask] = df.loc[mask]
    tm.assert_frame_equal(result, df)

    idx = date_range('20010101', periods=4)
    df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64')

    result = df.copy()
    result.loc[mask, :] = df.loc[mask, :]
    tm.assert_frame_equal(result, df)

    result = df.copy()
    result.loc[mask] = df.loc[mask]
    tm.assert_frame_equal(result, df)
def test_frame_setitem_multi_column(self):
    df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'],
                                          [0, 1, 0, 1]])

    cp = df.copy()
    cp['a'] = cp['b']
    tm.assert_frame_equal(cp['a'], cp['b'])

    # set with ndarray
    cp = df.copy()
    cp['a'] = cp['b'].values
    tm.assert_frame_equal(cp['a'], cp['b'])

    # ---------------------------------------
    # #1803
    columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')])
    df = DataFrame(index=[1, 3, 5], columns=columns)

    # Works, but adds a column instead of updating the two existing ones
    df['A'] = 0.0  # Doesn't work
    assert (df['A'].values == 0).all()

    # it broadcasts
    df['B', '1'] = [1, 2, 3]
    df['A'] = df['B', '1']

    sliced_a1 = df['A', '1']
    sliced_a2 = df['A', '2']
    sliced_b1 = df['B', '1']
    tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False)
    tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False)
    assert sliced_a1.name == ('A', '1')
    assert sliced_a2.name == ('A', '2')
    assert sliced_b1.name == ('B', '1')
def test_dropna(self):
    df = DataFrame(np.random.randn(6, 4))
    df[2][:2] = nan

    dropped = df.dropna(axis=1)
    expected = df.loc[:, [0, 1, 3]]
    inp = df.copy()
    inp.dropna(axis=1, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    dropped = df.dropna(axis=0)
    expected = df.loc[lrange(2, 6)]
    inp = df.copy()
    inp.dropna(axis=0, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    # threshold
    dropped = df.dropna(axis=1, thresh=5)
    expected = df.loc[:, [0, 1, 3]]
    inp = df.copy()
    inp.dropna(axis=1, thresh=5, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    dropped = df.dropna(axis=0, thresh=4)
    expected = df.loc[lrange(2, 6)]
    inp = df.copy()
    inp.dropna(axis=0, thresh=4, inplace=True)
    assert_frame_equal(dropped, expected)
    assert_frame_equal(inp, expected)

    dropped = df.dropna(axis=1, thresh=4)
    assert_frame_equal(dropped, df)

    dropped = df.dropna(axis=1, thresh=3)
    assert_frame_equal(dropped, df)

    # subset
    dropped = df.dropna(axis=0, subset=[0, 1, 3])
    inp = df.copy()
    inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
    assert_frame_equal(dropped, df)
    assert_frame_equal(inp, df)

    # all
    dropped = df.dropna(axis=1, how='all')
    assert_frame_equal(dropped, df)

    df[2] = nan
    dropped = df.dropna(axis=1, how='all')
    expected = df.loc[:, [0, 1, 3]]
    assert_frame_equal(dropped, expected)

    # bad input
    pytest.raises(ValueError, df.dropna, axis=3)
def test_inplace_ops_identity(self):
    # GH 5104
    # make sure that we are actually changing the object
    s_orig = Series([1, 2, 3])
    df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

    # no dtype change
    s = s_orig.copy()
    s2 = s
    s += 1
    assert_series_equal(s, s2)
    assert_series_equal(s_orig + 1, s)
    self.assertIs(s, s2)
    self.assertIs(s._data, s2._data)

    df = df_orig.copy()
    df2 = df
    df += 1
    assert_frame_equal(df, df2)
    assert_frame_equal(df_orig + 1, df)
    self.assertIs(df, df2)
    self.assertIs(df._data, df2._data)

    # dtype change
    s = s_orig.copy()
    s2 = s
    s += 1.5
    assert_series_equal(s, s2)
    assert_series_equal(s_orig + 1.5, s)

    df = df_orig.copy()
    df2 = df
    df += 1.5
    assert_frame_equal(df, df2)
    assert_frame_equal(df_orig + 1.5, df)
    self.assertIs(df, df2)
    self.assertIs(df._data, df2._data)

    # mixed dtype
    arr = np.random.randint(0, 10, size=5)
    df_orig = DataFrame({"A": arr.copy(), "B": "foo"})

    df = df_orig.copy()
    df2 = df
    df["A"] += 1
    expected = DataFrame({"A": arr.copy() + 1, "B": "foo"})
    assert_frame_equal(df, expected)
    assert_frame_equal(df2, expected)
    self.assertIs(df._data, df2._data)

    df = df_orig.copy()
    df2 = df
    df["A"] += 1.5
    expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"})
    assert_frame_equal(df, expected)
    assert_frame_equal(df2, expected)
    self.assertIs(df._data, df2._data)
def test_inplace_ops_identity(self):
    # GH 5104
    # make sure that we are actually changing the object
    s_orig = Series([1, 2, 3])
    df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

    # no dtype change
    s = s_orig.copy()
    s2 = s
    s += 1
    assert_series_equal(s, s2)
    assert_series_equal(s_orig + 1, s)
    assert s is s2
    assert s._data is s2._data

    df = df_orig.copy()
    df2 = df
    df += 1
    assert_frame_equal(df, df2)
    assert_frame_equal(df_orig + 1, df)
    assert df is df2
    assert df._data is df2._data

    # dtype change
    s = s_orig.copy()
    s2 = s
    s += 1.5
    assert_series_equal(s, s2)
    assert_series_equal(s_orig + 1.5, s)

    df = df_orig.copy()
    df2 = df
    df += 1.5
    assert_frame_equal(df, df2)
    assert_frame_equal(df_orig + 1.5, df)
    assert df is df2
    assert df._data is df2._data

    # mixed dtype
    arr = np.random.randint(0, 10, size=5)
    df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})

    df = df_orig.copy()
    df2 = df
    df['A'] += 1
    expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
    assert_frame_equal(df, expected)
    assert_frame_equal(df2, expected)
    assert df._data is df2._data

    df = df_orig.copy()
    df2 = df
    df['A'] += 1.5
    expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
    assert_frame_equal(df, expected)
    assert_frame_equal(df2, expected)
    assert df._data is df2._data
def test_regex_replace_dict_mixed(self):
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
    dfmix = DataFrame(mix)

    # dicts
    # single dict {re1: v1}, search the whole frame
    # need test for this...

    # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
    # frame
    res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, inplace=True,
                 regex=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
    # whole frame
    res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
    res2 = dfmix.copy()
    res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True,
                 regex=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
    res2 = dfmix.copy()
    res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'},
                 inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # scalar -> dict
    # to_replace regex, {value: value}
    expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'],
                       'c': mix['c']})
    res = dfmix.replace('a', {'b': np.nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace('a', {'b': np.nan}, regex=True, inplace=True)
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    res = dfmix.replace('a', {'b': np.nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace(regex='a', value={'b': np.nan}, inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)
def test_interp_inplace(self):
    df = DataFrame({'a': [1., 2., np.nan, 4.]})
    expected = DataFrame({'a': [1., 2., 3., 4.]})
    result = df.copy()
    result['a'].interpolate(inplace=True)
    assert_frame_equal(result, expected)

    result = df.copy()
    result['a'].interpolate(inplace=True, downcast='infer')
    assert_frame_equal(result, expected.astype('int64'))
def test_replace_datetimetz(self):
    # GH 11326
    # behaving poorly when presented with a datetime64[ns, tz]
    df = DataFrame({'A': date_range('20130101', periods=3,
                                    tz='US/Eastern'),
                    'B': [0, np.nan, 2]})
    result = df.replace(np.nan, 1)
    expected = DataFrame({'A': date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                          'B': Series([0, 1, 2], dtype='float64')})
    assert_frame_equal(result, expected)

    result = df.fillna(1)
    assert_frame_equal(result, expected)

    result = df.replace(0, np.nan)
    expected = DataFrame({'A': date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                          'B': [np.nan, np.nan, 2]})
    assert_frame_equal(result, expected)

    result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                        Timestamp('20130104', tz='US/Eastern'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104', tz='US/Eastern'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)

    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace(
        {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
    assert_frame_equal(result, expected)

    # coerce to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace(
        {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104', tz='US/Pacific'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)

    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': np.nan}, Timestamp('20130104'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)
def test_replace_input_formats(self):
    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(to_rep, values)
    expected = {}
    for k, v in compat.iteritems(df):
        expected[k] = v.replace(to_rep[k], values[k])
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                          'C': ['', 'asdf', 'fd']})
    assert_frame_equal(result, expected)

    # dict to scalar
    filled = df.replace(to_rep, 0)
    expected = {}
    for k, v in compat.iteritems(df):
        expected[k] = v.replace(to_rep[k], 0)
    assert_frame_equal(filled, DataFrame(expected))

    self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(np.nan, values)
    expected = {}
    for k, v in compat.iteritems(df):
        expected[k] = v.replace(np.nan, values[k])
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    for i in range(len(to_rep)):
        expected.replace(to_rep[i], values[i], inplace=True)
    assert_frame_equal(result, expected)

    self.assertRaises(ValueError, df.replace, to_rep, values[1:])

    # list to scalar
    to_rep = [np.nan, 0, '']
    result = df.replace(to_rep, -1)
    expected = df.copy()
    for i in range(len(to_rep)):
        expected.replace(to_rep[i], -1, inplace=True)
    assert_frame_equal(result, expected)
def test_multi_assign(self):
    # GH 3626, an assignment of a sub-df to a df
    df = DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
                    'PF': [0, 0, 0, 0, 1, 1],
                    'col1': list(range(6)),
                    'col2': list(range(6, 12))})
    df.iloc[1, 0] = np.nan
    df2 = df.copy()

    mask = ~df2.FC.isna()
    cols = ['col1', 'col2']

    dft = df2 * 2
    dft.iloc[3, 3] = np.nan

    expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
                          'PF': [0, 0, 0, 0, 1, 1],
                          'col1': Series([0, 1, 4, 6, 8, 10]),
                          'col2': [12, 7, 16, np.nan, 20, 22]})

    # frame on rhs
    df2.loc[mask, cols] = dft.loc[mask, cols]
    tm.assert_frame_equal(df2, expected)

    df2.loc[mask, cols] = dft.loc[mask, cols]
    tm.assert_frame_equal(df2, expected)

    # with an ndarray on rhs
    # coerces to float64 because values has float64 dtype
    # GH 14001
    expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
                          'PF': [0, 0, 0, 0, 1, 1],
                          'col1': [0., 1., 4., 6., 8., 10.],
                          'col2': [12, 7, 16, np.nan, 20, 22]})
    df2 = df.copy()
    df2.loc[mask, cols] = dft.loc[mask, cols].values
    tm.assert_frame_equal(df2, expected)

    df2.loc[mask, cols] = dft.loc[mask, cols].values
    tm.assert_frame_equal(df2, expected)

    # broadcasting on the rhs is required
    df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11],
                        C=[0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7]))

    expected = df.copy()
    mask = expected['A'] == 0
    for col in ['A', 'B']:
        expected.loc[mask, col] = df['D']

    df.loc[df['A'] == 0, ['A', 'B']] = df['D']
    tm.assert_frame_equal(df, expected)
def test_ix_loc_setitem_consistency(self):
    # GH 5771
    # loc with slice and series
    s = Series(0, index=[4, 5, 6])
    s.loc[4:5] += 1
    expected = Series([1, 1, 0], index=[4, 5, 6])
    tm.assert_series_equal(s, expected)

    # GH 5928
    # chained indexing assignment
    df = DataFrame({'a': [0, 1, 2]})
    expected = df.copy()
    with catch_warnings(record=True):
        expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a']

    with catch_warnings(record=True):
        df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]]
    tm.assert_frame_equal(df, expected)

    df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]})
    with catch_warnings(record=True):
        df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype(
            'float64') + 0.5
    expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]})
    tm.assert_frame_equal(df, expected)

    # GH 8607
    # ix setitem consistency
    df = DataFrame({'delta': [1174, 904, 161],
                    'elapsed': [7673, 9277, 1470],
                    'timestamp': [1413840976, 1413842580, 1413760580]})
    expected = DataFrame({'delta': [1174, 904, 161],
                          'elapsed': [7673, 9277, 1470],
                          'timestamp': pd.to_datetime(
                              [1413840976, 1413842580, 1413760580],
                              unit='s')})

    df2 = df.copy()
    df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    tm.assert_frame_equal(df2, expected)

    df2 = df.copy()
    df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    tm.assert_frame_equal(df2, expected)

    df2 = df.copy()
    with catch_warnings(record=True):
        df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s')
    tm.assert_frame_equal(df2, expected)
def test_regex_replace_regex_list_to_numeric(self):
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
    df = DataFrame(mix)
    res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True)
    res2 = df.copy()
    res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True)
    res3 = df.copy()
    res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 0, 0, 0],
                       'c': ['a', 0, nan, 'd']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)
    assert_frame_equal(res3, expec)
def test_regex_replace_str_to_numeric(self):
    # what happens when you try to replace a numeric value with a regex?
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
    df = DataFrame(mix)
    res = df.replace(r'\s*\.\s*', 0, regex=True)
    res2 = df.copy()
    res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True)
    res3 = df.copy()
    res3.replace(regex=r'\s*\.\s*', value=0, inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', 0, 0],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)
    assert_frame_equal(res3, expec)
def test_combine_first_mixed_bug(self):
    idx = Index(['a', 'b', 'c', 'e'])
    ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
    ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
    ser3 = Series([12, 4, 5, 97], index=idx)

    frame1 = DataFrame({"col0": ser1,
                        "col2": ser2,
                        "col3": ser3})

    idx = Index(['a', 'b', 'c', 'f'])
    ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
    ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
    ser3 = Series([12, 4, 5, 97], index=idx)

    frame2 = DataFrame({"col1": ser1,
                        "col2": ser2,
                        "col5": ser3})

    combined = frame1.combine_first(frame2)
    assert len(combined.columns) == 5

    # gh 3016 (same as in update)
    df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                   columns=['A', 'B', 'bool1', 'bool2'])

    other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
    result = df.combine_first(other)
    assert_frame_equal(result, df)

    df.loc[0, 'A'] = np.nan
    result = df.combine_first(other)
    df.loc[0, 'A'] = 45
    assert_frame_equal(result, df)

    # doc example
    df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                     'B': [np.nan, 2., 3., np.nan, 6.]})

    df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                     'B': [np.nan, np.nan, 3., 4., 6., 8.]})

    result = df1.combine_first(df2)
    expected = DataFrame({'A': [1, 2, 3, 5, 3, 7.],
                          'B': [np.nan, 2, 3, 4, 6, 8]})
    assert_frame_equal(result, expected)

    # GH3552, return object dtype with bools
    df1 = DataFrame(
        [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
    df2 = DataFrame([[-42.6, np.nan, True], [-5., 1.6, False]],
                    index=[1, 2])

    result = df1.combine_first(df2)[2]
    expected = Series([True, True, False], name=2)
    assert_series_equal(result, expected)

    # GH 3593, converting datetime64[ns] incorrectly
    df0 = DataFrame({"a": [datetime(2000, 1, 1),
                           datetime(2000, 1, 2),
                           datetime(2000, 1, 3)]})
    df1 = DataFrame({"a": [None, None, None]})
    df2 = df1.combine_first(df0)
    assert_frame_equal(df2, df0)

    df2 = df0.combine_first(df1)
    assert_frame_equal(df2, df0)

    df0 = DataFrame({"a": [datetime(2000, 1, 1),
                           datetime(2000, 1, 2),
                           datetime(2000, 1, 3)]})
    df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
    df2 = df1.combine_first(df0)
    result = df0.copy()
    result.iloc[0, :] = df1.iloc[0, :]
    assert_frame_equal(df2, result)

    df2 = df0.combine_first(df1)
    assert_frame_equal(df2, df0)
def test_partial_setting(self):
    # GH2578, allow ix and friends to partially set

    # series
    s_orig = Series([1, 2, 3])

    s = s_orig.copy()
    s[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # iloc/iat raise
    s = s_orig.copy()

    def f():
        s.iloc[3] = 5.

    self.assertRaises(IndexError, f)

    def f():
        s.iat[3] = 5.

    self.assertRaises(IndexError, f)

    # ## frame ##

    df_orig = DataFrame(
        np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64')

    # iloc/iat raise
    df = df_orig.copy()

    def f():
        df.iloc[4, 2] = 5.

    self.assertRaises(IndexError, f)

    def f():
        df.iat[4, 2] = 5.

    self.assertRaises(IndexError, f)

    # row setting where it exists
    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.iloc[1] = df.iloc[2]
    tm.assert_frame_equal(df, expected)

    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.loc[1] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # like 2578, partial setting with dtype preservation
    expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
    df = df_orig.copy()
    df.loc[3] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
    df = df_orig.copy()
    df['B'] = df['B'].astype(np.float64)
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # single dtype frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    with catch_warnings(record=True):
        # ## panel ##
        p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                       items=['Item1', 'Item2'],
                       major_axis=pd.date_range('2001/1/12', periods=4),
                       minor_axis=['A', 'B'], dtype='float64')

        # panel setting via item
        p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                       items=['Item1', 'Item2'],
                       major_axis=pd.date_range('2001/1/12', periods=4),
                       minor_axis=['A', 'B'], dtype='float64')
        expected = p_orig.copy()
        expected['Item3'] = expected['Item1']
        p = p_orig.copy()
        p.loc['Item3'] = p['Item1']
        tm.assert_panel_equal(p, expected)

        # panel with aligned series
        expected = p_orig.copy()
        expected = expected.transpose(2, 1, 0)
        expected['C'] = DataFrame({'Item1': [30, 30, 30, 30],
                                   'Item2': [32, 32, 32, 32]},
                                  index=p_orig.major_axis)
        expected = expected.transpose(2, 1, 0)
        p = p_orig.copy()
        p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
        tm.assert_panel_equal(p, expected)

    # GH 8473
    dates = date_range('1/1/2000', periods=8)
    df_orig = DataFrame(np.random.randn(8, 4), index=dates,
                        columns=['A', 'B', 'C', 'D'])

    expected = pd.concat([df_orig,
                          DataFrame({'A': 7}, index=[dates[-1] + 1])])
    df = df_orig.copy()
    df.loc[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
    expected = pd.concat([df_orig, exp_other], axis=1)

    df = df_orig.copy()
    df.loc[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)
def write_plink_or_bolt_file(
        input_df: pd.DataFrame,
        path_or_buf: Union[Optional[str], os.PathLike, io.IOBase],
        binary_column_mapping: Dict[str, Dict[int, Union[int, float]]],
        missing_value: Union[str, int, float],
        cast_ints: bool = True) -> Optional[str]:
    """Writes a PLINK/BOLT formatted file of `input_df` to `path_or_buf`.

    This is the complementary function to `load_plink_or_bolt_file`. In
    particular, the `binary_column_mapping` input is expected to be created
    by the loading function to ensure that non-DeepNull-predicted columns
    retain the same values as in the input data.

    Args:
      input_df: The DataFrame to write to TSV.
      path_or_buf: The path to write the TSV to.
      binary_column_mapping: The mapping from binary column name to the
        mapping of the binary representation of that column in `input_df` to
        the original binary representation of the data.
      missing_value: The missing value to use when writing out. Typically
        'NA' for BOLT or Regenie, and possibly -9 for PLINK.
      cast_ints: If True, any fields that contain only integer values are
        written as integers.

    Returns:
      The result as a string if `path_or_buf` is None, otherwise None.
    """
    # Sanity check.
    if list(input_df.columns[:2]) != ['FID', 'IID']:
        raise ValueError('"FID" and "IID" required to start PLINK/BOLT file: '
                         f'{input_df.columns}')

    # Make a copy since we mutate, then transform binary fields to their
    # original representation.
    df = input_df.copy()
    for column, mapping in binary_column_mapping.items():
        df[column] = df[column].replace(mapping)

    if cast_ints:
        for column in df.columns:
            values = df[column]
            mask = ~values.isnull()
            try:
                int_values = values[mask].astype(int)
            except ValueError:
                # This is a non-numeric field, leave it as-is.
                continue
            else:
                if (values[mask] == int_values).all():
                    # All non-null values are integers. Convert to the 'Int64'
                    # type that allows nullable integers. This requires nulls
                    # to use the pd.NA value rather than np.nan.
                    df[column] = values.fillna(pd.NA).astype('Int64')

    return df.to_csv(path_or_buf, sep='\t', index=False,
                     na_rep=str(missing_value))
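# A minimal usage sketch for write_plink_or_bolt_file. The data and the
# mapping {'pheno': {0: 1, 1: 2}} are illustrative stand-ins for what
# load_plink_or_bolt_file would produce (restoring PLINK's 1/2 case-control
# coding); passing path_or_buf=None returns the TSV text instead of writing
# a file.
import numpy as np
import pandas as pd

example_df = pd.DataFrame({
    'FID': [1, 2, 3],
    'IID': [1, 2, 3],
    'pheno': [0, 1, np.nan],  # internal 0/1 coding with one missing value
})
tsv_text = write_plink_or_bolt_file(
    example_df,
    path_or_buf=None,
    binary_column_mapping={'pheno': {0: 1, 1: 2}},
    missing_value='NA',
)
# tsv_text now holds a tab-separated table whose 'pheno' column reads 1, 2, NA.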
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """Apply the transforms to the dataframe."""
    X = X.copy()

    # We could do some NLP here to get sentiment scores on the captions,
    # for example

    def __image_size_check(image_loc) -> float:
        image = mpimg.imread(image_loc)
        try:
            (h, w, n) = image.shape
        except ValueError:
            # Grayscale images unpack to (h, w) only.
            return np.nan
        if w < 800:
            return np.nan
        else:
            return (h, w, n)

    def __difference_from_mean_likes_per_follower(row) -> float:
        name = row['credits']
        ndays = (self.now - row['postdate']).days
        if ndays < 1:
            ndays = 1
        mean_likes = self.summary[
            self.summary['credits'] == name]['nlikes_per_follower'].values[0]
        return (row['nlikes_per_follower'] - mean_likes) / (mean_likes * ndays)

    def __difference_from_mean_comments_per_follower(row) -> float:
        name = row['credits']
        ndays = (self.now - row['postdate']).days
        if ndays < 1:
            ndays = 1
        mean_comments = self.summary[
            self.summary['credits'] == name]['ncomments_per_follower'].values[0]
        return (row['ncomments_per_follower'] - mean_comments) / (
            mean_comments * ndays)

    def __categorize_parks(parkid) -> int:
        return self.names_dir[parkid]

    def __post_rank(row, likes_weight=config.LIKES_WEIGHT,
                    comments_weight=config.COMMENT_WEIGHT) -> float:
        return (row['mean_nlikes_diff'] * likes_weight
                + row['mean_ncomments_diff'] * comments_weight)

    # def __previously_posted(floc):
    #     if floc in self.previous_posts:
    #         return np.nan
    #     else:
    #         return 1

    X['postdate'] = pd.to_datetime(X['postdate'])
    X['mean_nlikes_diff'] = X.apply(
        lambda row: __difference_from_mean_likes_per_follower(row), axis=1)
    X['mean_ncomments_diff'] = X.apply(
        lambda row: __difference_from_mean_comments_per_follower(row), axis=1)
    X['park_id'] = X['credits'].apply(__categorize_parks)
    X['rank'] = X.apply(lambda row: __post_rank(row), axis=1)
    X['image_size'] = X['Flocation'].apply(__image_size_check)
    return X
def test_inplace_return_self(self):
    # re #1893

    data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
                      'b': [0, 0, 1, 1],
                      'c': [1, 2, 3, 4]})

    def _check_f(base, f):
        result = f(base)
        assert result is None

    # -----DataFrame-----

    # set_index
    f = lambda x: x.set_index('a', inplace=True)
    _check_f(data.copy(), f)

    # reset_index
    f = lambda x: x.reset_index(inplace=True)
    _check_f(data.set_index('a'), f)

    # drop_duplicates
    f = lambda x: x.drop_duplicates(inplace=True)
    _check_f(data.copy(), f)

    # sort
    f = lambda x: x.sort_values('b', inplace=True)
    _check_f(data.copy(), f)

    # sort_index
    f = lambda x: x.sort_index(inplace=True)
    _check_f(data.copy(), f)

    # fillna
    f = lambda x: x.fillna(0, inplace=True)
    _check_f(data.copy(), f)

    # replace
    f = lambda x: x.replace(1, 0, inplace=True)
    _check_f(data.copy(), f)

    # rename
    f = lambda x: x.rename({1: 'foo'}, inplace=True)
    _check_f(data.copy(), f)

    # -----Series-----

    d = data.copy()['c']

    # reset_index
    f = lambda x: x.reset_index(inplace=True, drop=True)
    _check_f(data.set_index('a')['c'], f)

    # fillna
    f = lambda x: x.fillna(0, inplace=True)
    _check_f(d.copy(), f)

    # replace
    f = lambda x: x.replace(1, 0, inplace=True)
    _check_f(d.copy(), f)

    # rename
    f = lambda x: x.rename({1: 'foo'}, inplace=True)
    _check_f(d.copy(), f)
def _add_prefixes(self, causality_df: pd.DataFrame) -> pd.DataFrame:
    causes_causality_df = causality_df.copy()
    causes_causality_df[self.child_id_col] = causes_causality_df[
        self.child_id_col].apply(lambda x: 'causes_' + x)

    causedby_causality_df = causality_df.copy()
    causedby_causality_df[self.parent_id_col] = causedby_causality_df[
        self.parent_id_col].apply(lambda x: 'causedby_' + x)

    return pd.concat([causes_causality_df,
                      causedby_causality_df]).reset_index(drop=True)
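# Toy demonstration of the prefixing pattern above, with hypothetical column
# names 'parent_id' and 'child_id' standing in for self.parent_id_col and
# self.child_id_col; each causal edge is emitted twice, once per direction.
import pandas as pd

edges = pd.DataFrame({'parent_id': ['smoking'], 'child_id': ['cancer']})
causes = edges.assign(child_id='causes_' + edges['child_id'])
causedby = edges.assign(parent_id='causedby_' + edges['parent_id'])
both = pd.concat([causes, causedby]).reset_index(drop=True)
# both now holds ('smoking', 'causes_cancer') and ('causedby_smoking', 'cancer').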
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """Converts columns with a high ratio of missing values into binary features \
    and eventually drops them based on their correlation with other features and \
    the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
    other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
    other are dropped unless they correlate reasonably well with the target \
    variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops \
    columns identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio \
        larger than mv_threshold are candidates for dropping and undergo further \
        analysis, by default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously \
        identified feature (with a high mv-ratio) is allowed to have with another \
        feature. If this threshold is overstepped, the feature undergoes further \
        analysis, by default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a \
        remaining feature (i.e. feature with a high mv-ratio and high correlation \
        to another existing feature) with the target. If this threshold is not met \
        the feature is ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """
    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (
        data_local[cols_mv]
        .applymap(lambda x: 1 if not pd.isnull(x) else x)
        .fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target,
                         colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[
            abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
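# Hedged usage sketch for mv_col_handling: assumes the helpers it relies on
# (_validate_input_range, _missing_vals, corr_mat) are in scope, e.g. via the
# klib package this function comes from. Column 'b' in the toy frame is mostly
# missing, so it becomes a candidate for conversion and possible dropping.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'a': [1, 2, 3, 4, 5, 6, 7, 8],
    'b': [1.0, np.nan, np.nan, np.nan, np.nan, np.nan, 7.0, np.nan],
    'y': [0, 1, 1, 1, 0, 1, 0, 1],
})
cleaned, cols_mv, dropped = mv_col_handling(toy, target='y',
                                            return_details=True)
# cols_mv lists columns whose missing-value ratio exceeded mv_threshold;
# dropped is the subset that was ultimately removed from `cleaned`.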
def fill_nan_mean(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Return a copy of `df` with NaNs in `columns` replaced by column means."""
    df = df.copy()
    for column in columns:
        df[column] = df[column].fillna(df[column].mean())
    return df
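# Quick illustrative check of fill_nan_mean on a toy frame (names assumed).
import numpy as np
import pandas as pd

demo = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'y': [np.nan, 2.0, 4.0]})
filled = fill_nan_mean(demo, ['x', 'y'])
# filled['x'] == [1.0, 2.0, 3.0] and filled['y'] == [3.0, 2.0, 4.0];
# demo itself is untouched because the function works on a copy.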
def _filter_feasible_rows(
    df: pd.DataFrame,
    optimization_config: OptimizationConfig,
    status_quo: Optional[Arm],
) -> pd.DataFrame:
    """Filter out arms that do not satisfy outcome constraints

    Looks at all arm data collected and removes rows corresponding to arms in
    which one or more of their associated metrics' 95% confidence interval
    falls outside of any outcome constraint's bounds (i.e. we are 95% sure the
    bound is not satisfied).
    """
    if len(optimization_config.outcome_constraints) < 1:
        return df

    name = df["metric_name"]

    # When SEM is NaN we should treat it as if it were 0
    sems = not_none(df["sem"].fillna(0))

    # Bounds computed for 95% confidence interval on Normal distribution
    lower_bound = df["mean"] - sems * 1.96
    upper_bound = df["mean"] + sems * 1.96

    # Only compute relativization if some constraints are relative
    rel_df = None
    rel_lower_bound = None
    rel_upper_bound = None
    if status_quo is not None and any(
        oc.relative for oc in optimization_config.outcome_constraints
    ):
        # relativize_data expects all arms to come from the same trial, we
        # need to format the data as if it was.
        to_relativize = df.copy()
        to_relativize["trial_index"] = 0

        rel_df = relativize_data(
            data=Data(to_relativize), status_quo_name=status_quo.name
        ).df.append(
            {
                "arm_name": "status_quo",
                "metric_name": status_quo.name,
                "mean": 0,
                "sem": 0,
            },
            ignore_index=True,
        )
        rel_sems = not_none(rel_df["sem"].fillna(0))
        rel_lower_bound = rel_df["mean"] - rel_sems * 1.96
        rel_upper_bound = rel_df["mean"] + rel_sems * 1.96

    # Nested function from OC -> Mask for consumption in later map/reduce from
    # [OC] -> Mask. Constraint relativity is handled inside so long as
    # relative bounds are set in surrounding closure (which will occur in
    # proper experiment setup).
    def oc_mask(oc: OutcomeConstraint) -> pd.Series:
        name_match_mask = name == oc.metric.name

        if oc.relative:
            if rel_lower_bound is None or rel_upper_bound is None:
                logger.warning(
                    f"No status quo provided; relative constraint {oc} ignored."
                )
                return pd.Series(True, index=df.index)
            observed_lower_bound = rel_lower_bound
            observed_upper_bound = rel_upper_bound
        else:
            observed_lower_bound = lower_bound
            observed_upper_bound = upper_bound

        # Return True if metrics are different, or whether the confidence
        # interval is entirely not within the bound
        if oc.op == ComparisonOp.GEQ:
            return ~name_match_mask | (observed_upper_bound > oc.bound)
        else:
            return ~name_match_mask | (observed_lower_bound < oc.bound)

    mask = reduce(
        lambda left, right: left & right,
        map(oc_mask, optimization_config.outcome_constraints),
    )
    bad_arm_names = (
        df[~mask]["arm_name"].tolist()
        if rel_df is None
        else rel_df[~mask]["arm_name"].tolist()
    )
    feasible = df.loc[df["arm_name"].apply(lambda x: x not in bad_arm_names)]

    if feasible.empty:
        raise ValueError(
            "No points satisfied all outcome constraints within 95 percent "
            "confidence interval"
        )

    return feasible
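# Standalone illustration of the confidence-bound logic above, using plain
# pandas and no Ax objects (names here are hypothetical): for a GEQ
# constraint, an arm is dropped only when even mean + 1.96 * sem falls below
# the bound, i.e. we are ~95% sure the constraint is violated.
import pandas as pd

toy = pd.DataFrame({
    "arm_name": ["0_0", "0_1"],
    "mean": [1.0, 5.0],
    "sem": [0.5, 0.5],
})
upper_bound = toy["mean"] + toy["sem"] * 1.96
geq_bound = 3.0
feasible = toy[upper_bound > geq_bound]
# Only arm "0_1" survives: 1.0 + 1.96 * 0.5 = 1.98 < 3.0, so "0_0" is dropped.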
class TestRollingTS:

    # rolling time-series friendly
    # xref GH13327

    def setup_method(self, method):
        self.regular = DataFrame(
            {"A": date_range("20130101", periods=5, freq="s"),
             "B": range(5)}
        ).set_index("A")

        self.ragged = DataFrame({"B": range(5)})
        self.ragged.index = [
            Timestamp("20130101 09:00:00"),
            Timestamp("20130101 09:00:02"),
            Timestamp("20130101 09:00:03"),
            Timestamp("20130101 09:00:05"),
            Timestamp("20130101 09:00:06"),
        ]

    def test_doc_string(self):
        df = DataFrame(
            {"B": [0, 1, 2, np.nan, 4]},
            index=[
                Timestamp("20130101 09:00:00"),
                Timestamp("20130101 09:00:02"),
                Timestamp("20130101 09:00:03"),
                Timestamp("20130101 09:00:05"),
                Timestamp("20130101 09:00:06"),
            ],
        )
        df
        df.rolling("2s").sum()

    def test_valid(self):
        df = self.regular

        # not a valid freq
        with pytest.raises(ValueError):
            df.rolling(window="foobar")

        # not a datetimelike index
        with pytest.raises(ValueError):
            df.reset_index().rolling(window="foobar")

        # non-fixed freqs
        for freq in ["2MS", offsets.MonthBegin(2)]:
            with pytest.raises(ValueError):
                df.rolling(window=freq)

        for freq in ["1D", offsets.Day(2), "2ms"]:
            df.rolling(window=freq)

        # non-integer min_periods
        for minp in [1.0, "foo", np.array([1, 2, 3])]:
            with pytest.raises(ValueError):
                df.rolling(window="1D", min_periods=minp)

        # center is not implemented
        with pytest.raises(NotImplementedError):
            df.rolling(window="1D", center=True)

    def test_on(self):
        df = self.regular

        # not a valid column
        with pytest.raises(ValueError):
            df.rolling(window="2s", on="foobar")

        # column is valid
        df = df.copy()
        df["C"] = date_range("20130101", periods=len(df))
        df.rolling(window="2d", on="C").sum()

        # invalid columns
        with pytest.raises(ValueError):
            df.rolling(window="2d", on="B")

        # ok even though on non-selected
        df.rolling(window="2d", on="C").B.sum()

    def test_monotonic_on(self):
        # on/index must be monotonic
        df = DataFrame(
            {"A": date_range("20130101", periods=5, freq="s"),
             "B": range(5)}
        )

        assert df.A.is_monotonic
        df.rolling("2s", on="A").sum()

        df = df.set_index("A")
        assert df.index.is_monotonic
        df.rolling("2s").sum()

    def test_non_monotonic_on(self):
        # GH 19248
        df = DataFrame(
            {"A": date_range("20130101", periods=5, freq="s"),
             "B": range(5)}
        )
        df = df.set_index("A")
        non_monotonic_index = df.index.to_list()
        non_monotonic_index[0] = non_monotonic_index[3]
        df.index = non_monotonic_index

        assert not df.index.is_monotonic

        with pytest.raises(ValueError):
            df.rolling("2s").sum()

        df = df.reset_index()

        with pytest.raises(ValueError):
            df.rolling("2s", on="A").sum()

    def test_frame_on(self):
        df = DataFrame(
            {"B": range(5),
             "C": date_range("20130101 09:00:00", periods=5, freq="3s")}
        )

        df["A"] = [
            Timestamp("20130101 09:00:00"),
            Timestamp("20130101 09:00:02"),
            Timestamp("20130101 09:00:03"),
            Timestamp("20130101 09:00:05"),
            Timestamp("20130101 09:00:06"),
        ]

        # we are doing simulating using 'on'
        expected = df.set_index("A").rolling("2s").B.sum().reset_index(
            drop=True)

        result = df.rolling("2s", on="A").B.sum()
        tm.assert_series_equal(result, expected)

        # test as a frame
        # we should be ignoring the 'on' as an aggregation column
        # note that the expected is setting, computing, and resetting
        # so the columns need to be switched compared
        # to the actual result where they are ordered as in the
        # original
        expected = (
            df.set_index("A").rolling("2s")[["B"]].sum()
            .reset_index()[["B", "A"]]
        )

        result = df.rolling("2s", on="A")[["B"]].sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_on2(self):
        # using multiple aggregation columns
        df = DataFrame(
            {
                "A": [0, 1, 2, 3, 4],
                "B": [0, 1, 2, np.nan, 4],
                "C": Index(
                    [
                        Timestamp("20130101 09:00:00"),
                        Timestamp("20130101 09:00:02"),
                        Timestamp("20130101 09:00:03"),
                        Timestamp("20130101 09:00:05"),
                        Timestamp("20130101 09:00:06"),
                    ]
                ),
            },
            columns=["A", "C", "B"],
        )

        expected1 = DataFrame(
            {"A": [0.0, 1, 3, 3, 7],
             "B": [0, 1, 3, np.nan, 4],
             "C": df["C"]},
            columns=["A", "C", "B"],
        )

        result = df.rolling("2s", on="C").sum()
        expected = expected1
        tm.assert_frame_equal(result, expected)

        expected = Series([0, 1, 3, np.nan, 4], name="B")
        result = df.rolling("2s", on="C").B.sum()
        tm.assert_series_equal(result, expected)

        expected = expected1[["A", "B", "C"]]
        result = df.rolling("2s", on="C")[["A", "B", "C"]].sum()
        tm.assert_frame_equal(result, expected)

    def test_basic_regular(self):
        df = self.regular.copy()

        df.index = date_range("20130101", periods=5, freq="D")
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="1D").sum()
        tm.assert_frame_equal(result, expected)

        df.index = date_range("20130101", periods=5, freq="2D")
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="2D", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="2D", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

        expected = df.rolling(window=1).sum()
        result = df.rolling(window="2D").sum()
        tm.assert_frame_equal(result, expected)

    def test_min_periods(self):
        # compare for min_periods
        df = self.regular

        # these slightly different
        expected = df.rolling(2, min_periods=1).sum()
        result = df.rolling("2s").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.rolling(2, min_periods=1).sum()
        result = df.rolling("2s", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

    def test_closed(self):
        # xref GH13965

        df = DataFrame(
            {"A": [1] * 5},
            index=[
                Timestamp("20130101 09:00:01"),
                Timestamp("20130101 09:00:02"),
                Timestamp("20130101 09:00:03"),
                Timestamp("20130101 09:00:04"),
                Timestamp("20130101 09:00:06"),
            ],
        )

        # closed must be 'right', 'left', 'both', 'neither'
        with pytest.raises(ValueError):
            self.regular.rolling(window="2s", closed="blabla")

        expected = df.copy()
        expected["A"] = [1.0, 2, 2, 2, 1]
        result = df.rolling("2s", closed="right").sum()
        tm.assert_frame_equal(result, expected)

        # default should be 'right'
        result = df.rolling("2s").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.copy()
        expected["A"] = [1.0, 2, 3, 3, 2]
        result = df.rolling("2s", closed="both").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.copy()
        expected["A"] = [np.nan, 1.0, 2, 2, 1]
        result = df.rolling("2s", closed="left").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.copy()
        expected["A"] = [np.nan, 1.0, 1, 1, np.nan]
        result = df.rolling("2s", closed="neither").sum()
        tm.assert_frame_equal(result, expected)

    def test_ragged_sum(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 3, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=2).sum()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 3, np.nan, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 5, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s").sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 5, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="4s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 6, 9]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="4s", min_periods=3).sum()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 3, 6, 9]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 6, 10]
        tm.assert_frame_equal(result, expected)

    def test_ragged_mean(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).mean()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).mean()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_median(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).median()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).median()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_quantile(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).quantile(0.5)
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).quantile(0.5)
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_std(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).std(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="1s", min_periods=1).std(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s", min_periods=1).std(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] + [0.5] * 4
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).std(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994]
        tm.assert_frame_equal(result, expected)

    def test_ragged_var(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).var(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="1s", min_periods=1).var(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s", min_periods=1).var(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] + [0.25] * 4
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).var(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0]
        tm.assert_frame_equal(result, expected)

    def test_ragged_skew(self):
        df = self.ragged
        result = df.rolling(window="3s", min_periods=1).skew()
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).skew()
        expected = df.copy()
        expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0]
        tm.assert_frame_equal(result, expected)

    def test_ragged_kurt(self):
        df = self.ragged
        result = df.rolling(window="3s", min_periods=1).kurt()
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).kurt()
        expected = df.copy()
        expected["B"] = [np.nan] * 4 + [-1.2]
        tm.assert_frame_equal(result, expected)

    def test_ragged_count(self):
        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).count()
        expected = df.copy()
        expected["B"] = [1.0, 1, 1, 1, 1]
        tm.assert_frame_equal(result, expected)

        df = self.ragged
        result = df.rolling(window="1s").count()
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).count()
        expected = df.copy()
        expected["B"] = [1.0, 1, 2, 1, 2]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=2).count()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 2, np.nan, 2]
        tm.assert_frame_equal(result, expected)

    def test_regular_min(self):
        df = DataFrame(
            {"A": date_range("20130101", periods=5, freq="s"),
             "B": [0.0, 1, 2, 3, 4]}
        ).set_index("A")
        result = df.rolling("1s").min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        df = DataFrame(
            {"A": date_range("20130101", periods=5, freq="s"),
             "B": [5, 4, 3, 4, 5]}
        ).set_index("A")

        tm.assert_frame_equal(result, expected)
        result = df.rolling("2s").min()
        expected = df.copy()
        expected["B"] = [5.0, 4, 3, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling("5s").min()
        expected = df.copy()
        expected["B"] = [5.0, 4, 3, 3, 3]
        tm.assert_frame_equal(result, expected)

    def test_ragged_min(self):
        df = self.ragged

        result = df.rolling(window="1s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1, 3, 3]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 0, 0, 1, 1]
        tm.assert_frame_equal(result, expected)

    def test_perf_min(self):
        N = 10000

        dfp = DataFrame(
            {"B": np.random.randn(N)},
            index=date_range("20130101", periods=N, freq="s"),
        )
        expected = dfp.rolling(2, min_periods=1).min()
        result = dfp.rolling("2s").min()
        assert ((result - expected) < 0.01).all().bool()

        expected = dfp.rolling(200, min_periods=1).min()
        result = dfp.rolling("200s").min()
        assert ((result - expected) < 0.01).all().bool()

    def test_ragged_max(self):
        df = self.ragged

        result = df.rolling(window="1s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

    def test_ragged_apply(self, raw):
        df = self.ragged

        f = lambda x: 1
        result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw)
        expected = df.copy()
        expected["B"] = 1.0
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw)
        expected = df.copy()
        expected["B"] = 1.0
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw)
        expected = df.copy()
        expected["B"] = 1.0
        tm.assert_frame_equal(result, expected)

    def test_all(self):
        # simple comparison of integer vs time-based windowing
        df = self.regular * 2
        er = df.rolling(window=1)
        r = df.rolling(window="1s")

        for f in [
            "sum",
            "mean",
            "count",
            "median",
            "std",
            "var",
            "kurt",
            "skew",
            "min",
            "max",
        ]:
            result = getattr(r, f)()
            expected = getattr(er, f)()
            tm.assert_frame_equal(result, expected)
result = r.quantile(0.5) expected = er.quantile(0.5) tm.assert_frame_equal(result, expected) def test_all_apply(self, raw): df = self.regular * 2 er = df.rolling(window=1) r = df.rolling(window="1s") result = r.apply(lambda x: 1, raw=raw) expected = er.apply(lambda x: 1, raw=raw) tm.assert_frame_equal(result, expected) def test_all2(self): # more sophisticated comparison of integer vs. # time-based windowing df = DataFrame({"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H")) # in-range data dft = df.between_time("09:00", "16:00") r = dft.rolling(window="5H") for f in [ "sum", "mean", "count", "median", "std", "var", "kurt", "skew", "min", "max", ]: result = getattr(r, f)() # we need to roll the days separately # to compare with a time-based roll # finally groupby-apply will return a multi-index # so we need to drop the day def agg_by_day(x): x = x.between_time("09:00", "16:00") return getattr(x.rolling(5, min_periods=1), f)() expected = (df.groupby(df.index.day).apply(agg_by_day).reset_index( level=0, drop=True)) tm.assert_frame_equal(result, expected) def test_groupby_monotonic(self): # GH 15130 # we don't need to validate monotonicity when grouping data = [ ["David", "1/1/2015", 100], ["David", "1/5/2015", 500], ["David", "5/30/2015", 50], ["David", "7/25/2015", 50], ["Ryan", "1/4/2014", 100], ["Ryan", "1/19/2015", 500], ["Ryan", "3/31/2016", 50], ["Joe", "7/1/2015", 100], ["Joe", "9/9/2015", 500], ["Joe", "10/15/2015", 50], ] df = DataFrame(data=data, columns=["name", "date", "amount"]) df["date"] = to_datetime(df["date"]) expected = (df.set_index("date").groupby("name").apply( lambda x: x.rolling("180D")["amount"].sum())) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) def test_non_monotonic(self): # GH 13966 (similar to #15130, closed by #15175) dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") df = DataFrame({ "A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.concatenate((dates, dates)), "C": np.arange(40), }) result = df.groupby("A").rolling("4s", on="B").C.mean() expected = (df.set_index("B").groupby("A").apply( lambda x: x.rolling("4s")["C"].mean())) tm.assert_series_equal(result, expected) df2 = df.sort_values("B") result = df2.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) def test_rolling_cov_offset(self): # GH16058 idx = date_range("2017-01-01", periods=24, freq="1h") ss = Series(np.arange(len(idx)), index=idx) result = ss.rolling("2h").cov() expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx) tm.assert_series_equal(result, expected) expected2 = ss.rolling(2, min_periods=1).cov() tm.assert_series_equal(result, expected2) result = ss.rolling("3h").cov() expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx) tm.assert_series_equal(result, expected) expected2 = ss.rolling(3, min_periods=1).cov() tm.assert_series_equal(result, expected2) def test_rolling_on_decreasing_index(self): # GH-19248 index = [ Timestamp("20190101 09:00:00"), Timestamp("20190101 09:00:02"), Timestamp("20190101 09:00:03"), Timestamp("20190101 09:00:05"), Timestamp("20190101 09:00:06"), ] df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index)) result = df.rolling("2s").min() expected = DataFrame({"column": [3.0, 3.0, 3.0, 2.0, 1.0]}, index=reversed(index)) tm.assert_frame_equal(result, expected) def test_rolling_on_multi_index_level(self): # GH-15584 df = DataFrame( {"column": range(6)}, index=MultiIndex.from_product( 
[date_range("20190101", periods=3), range(2)], names=["date", "seq"]), ) result = df.rolling("10d", on=df.index.get_level_values("date")).sum() expected = DataFrame({"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index) tm.assert_frame_equal(result, expected)
def initialize_predictor(self, ratings: pd.DataFrame): self.all_movies_AVG = ratings["rating"].mean() self.ratings_data = ratings.copy() self.ratings_data['rating'] -= self.all_movies_AVG self.user_means = self.ratings_data.groupby('user')['rating'].mean() self.movie_means = self.ratings_data.groupby('item')['rating'].mean()
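# A hedged sketch of how the statistics cached by initialize_predictor might be used at
# prediction time. The real predict method is not shown in this source; `predict_rating`
# below is a hypothetical name, and the bias-sum baseline is an assumption, not the
# author's confirmed formula.
def predict_rating(self, user, item) -> float:
    # user_means / movie_means were computed on mean-centered ratings, so they act
    # as biases around the global average.
    user_bias = self.user_means.get(user, 0.0)
    item_bias = self.movie_means.get(item, 0.0)
    return self.all_movies_AVG + user_bias + item_bias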
def from_proj(cls, proj_path: str, dataframe: pd.DataFrame,
              sub_dataframe_name: str = 'root',
              dataframe_filter_history: dict = None):
    """
    :param proj_path: root directory of the project
    :param dataframe: Chosen Child DataFrame from the Mesmerize Project
    :param sub_dataframe_name: Name of the sub DataFrame to load
    :param dataframe_filter_history: Filter history of the child dataframe
    """
    df = dataframe.copy()

    logger.info('Collecting image metadata')
    tqdm().pandas()
    df[['meta', 'stim_maps']] = df.progress_apply(
        lambda r: Transmission._load_files(proj_path, r), axis=1)
    Transmission._load_imginfo.cache_clear()

    try:
        logger.info('Collecting curve data')
        df['_RAW_CURVE'] = df['ROI_State'].progress_apply(
            lambda r: r['curve_data'][1])
    except:
        raise IndexError(
            "Curve data missing from one of your samples.\n"
            "See the progress bar to get the row index of "
            "the project dataframe where the curve data are missing")

    try:
        logger.info('Collecting spike data')
        df['_SPIKES'] = df['ROI_State'].progress_apply(lambda r: r[
            'spike_data'][1] if (r['spike_data'] is not None) else None)
    except KeyError:
        warn('spike data not found, probably from Mesmerize version < 0.2')

    try:
        logger.info('Collecting dfof data')
        df['_DFOF'] = df['ROI_State'].progress_apply(lambda r: r[
            'dfof_data'][1] if (r['dfof_data'] is not None) else None)
    except KeyError:
        warn('dfof data not found, probably from Mesmerize version < 0.2')

    df.sort_values(by=['SampleID'], inplace=True)
    df = df.reset_index(drop=True)

    h = HistoryTrace()
    df, block_id = h.create_data_block(df)
    params = {
        'sub_dataframe_name': sub_dataframe_name,
        'dataframe_filter_history': dataframe_filter_history
    }
    h.add_operation(data_block_id=block_id,
                    operation='spawn_transmission',
                    parameters=params)

    proj_config = get_proj_config(proj_path)
    try:
        roi_type_defs = proj_config.options('ROI_DEFS')
        stim_type_defs = proj_config.options('STIM_DEFS')
        custom_columns = proj_config.options('CUSTOM_COLUMNS')
    except:
        raise ValueError(
            'Could not read project configuration when creating Transmission'
            '\n' + traceback.format_exc())

    return cls(df,
               proj_path=proj_path,
               history_trace=h,
               last_output=None,
               last_unit='time',
               ROI_DEFS=roi_type_defs,
               STIM_DEFS=stim_type_defs,
               CUSTOM_COLUMNS=custom_columns)
def export2loom(ex_mtx: pd.DataFrame, regulons: List[Regulon],
                cell_annotations: Mapping[str, str],
                out_fname: str, num_cores=cpu_count()):
    """
    Create a loom file for a single cell experiment to be used in SCope.

    :param ex_mtx: The expression matrix (n_cells x n_genes).
    :param regulons: A list of Regulons.
    :param cell_annotations: A dictionary that maps a cell ID to its corresponding cell type annotation.
    :param out_fname: The name of the file to create.
    :param num_cores: The number of cores to use for AUCell regulon enrichment.
    """
    # Information on the general loom file format: http://linnarssonlab.org/loompy/format/index.html
    # Information on the SCope specific alterations: https://github.com/aertslab/SCope/wiki/Data-Format

    # TODO: Not mandatory but adding a section "regulonThresholds" to the general metadata would give
    # TODO: additional information to the SCope tool to preset a threshold on the AUC distribution of a regulon
    # TODO: across cells and help with binarization, i.e. deciding if the regulon is "on" or "off" in a cell.

    # Calculate regulon enrichment per cell using AUCell.
    auc_mtx = aucell(ex_mtx, regulons, num_cores=num_cores)  # (n_cells x n_regulons)

    # Create an embedding based on UMAP (similar to tSNE but faster).
    umap_embedding_mtx = pd.DataFrame(data=UMAP().fit_transform(auc_mtx),
                                      index=ex_mtx.index,
                                      columns=['UMAP1', 'UMAP2'])  # (n_cells, 2)

    # Calculate the number of genes per cell.
    binary_mtx = ex_mtx.copy()
    binary_mtx[binary_mtx != 0] = 1.0
    ngenes = binary_mtx.sum(axis=1).astype(int)

    # Encode genes in regulons as a "binary" membership matrix.
    genes = np.array(ex_mtx.columns)
    n_genes = len(genes)
    n_regulons = len(regulons)
    data = np.zeros(shape=(n_genes, n_regulons), dtype=int)
    for idx, regulon in enumerate(regulons):
        data[:, idx] = np.isin(genes, regulon.genes).astype(int)
    regulon_assignment = pd.DataFrame(data=data,
                                      index=ex_mtx.columns,
                                      columns=list(map(attrgetter('name'), regulons)))

    # Encode cell type clusters.
    name2idx = dict(map(reversed, enumerate(sorted(set(cell_annotations.values())))))
    clusterings = pd.DataFrame(data=ex_mtx.index,
                               index=ex_mtx.index,
                               columns=['Cell Type']).replace(cell_annotations).replace(name2idx)

    # Create meta-data structure.
    def create_structure_array(df):
        # Create a numpy structured array. Note: DataFrame.as_matrix() was removed in
        # pandas 1.0; .values is the equivalent that works across versions.
        return np.array([tuple(row) for row in df.values],
                        dtype=np.dtype(list(zip(df.columns, df.dtypes))))

    nomenclatures = set(map(attrgetter('nomenclature'), regulons))
    assert len(nomenclatures) == 1

    title = os.path.splitext(os.path.basename(out_fname))[0]

    column_attrs = {
        "CellID": ex_mtx.index.values.astype('str'),
        "nGene": ngenes.values,
        "Embedding": create_structure_array(umap_embedding_mtx),
        "RegulonsAUC": create_structure_array(auc_mtx),
        "Clusterings": create_structure_array(clusterings),
        "ClusterID": clusterings.values
    }
    row_attrs = {
        "Gene": ex_mtx.columns.values.astype('str'),
        "Regulons": create_structure_array(regulon_assignment),
    }
    general_attrs = {
        "title": title,
        "MetaData": json.dumps({
            "embeddings": [{
                "id": 0,
                "name": "UMAP (default)",
            }],
            "annotations": [{
                "name": "",
                "values": []
            }],
            "clusterings": [{
                "id": 0,
                "group": "celltype",
                "name": "Cell Type",
                "clusters": [{"id": idx, "description": name} for name, idx in name2idx.items()]
            }]}),
        "Genome": next(iter(nomenclatures))}

    # Create loom file for use with the SCope tool.
    # The loom file format opted for rows as genes to facilitate growth along the column
    # axis (i.e. adding more cells). PySCENIC chose a different orientation because of
    # limitations set by the feather format: selectively reading information from disk can
    # only be achieved via column selection. For the ranking databases this is of utmost
    # importance.
    fh = lp.create(filename=out_fname,
                   matrix=ex_mtx.T.values,
                   row_attrs=row_attrs,
                   col_attrs=column_attrs,
                   file_attrs=general_attrs)
    fh.close()
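# A minimal sketch of reading the file back to confirm the orientation discussed above:
# loompy stores the matrix as genes x cells, so the row count should equal the number of
# columns (genes) of the original expression matrix. Assumes `lp` is the loompy module
# already imported by this file; `check_loom_orientation` is a hypothetical helper.
def check_loom_orientation(ex_mtx, out_fname):
    with lp.connect(out_fname) as ds:
        assert ds.shape == (ex_mtx.shape[1], ex_mtx.shape[0])  # (n_genes, n_cells)
        assert list(ds.ra["Gene"]) == list(ex_mtx.columns.astype(str))
        assert list(ds.ca["CellID"]) == list(ex_mtx.index.astype(str))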
def to_sql( df: pd.DataFrame, table_name: str, creds: SqlCreds, sql_type: str = "table", schema: str = "dbo", index: bool = True, if_exists: str = "fail", batch_size: int = None, debug: bool = False, bcp_path: str = None, ): """ Writes the pandas DataFrame to a SQL table or view. Will write all columns to the table or view. If the destination table/view doesn't exist, will create it. Assumes the SQL table/view has the same number, name, and type of columns. To only write parts of the DataFrame, filter it beforehand and pass that to this function. Unlike the pandas counterpart, if the DataFrame has no rows, nothing will happen. Parameters ---------- df : pandas.DataFrame table_name : str Name of SQL table or view, without the schema creds : bcpandas.SqlCreds The credentials used in the SQL database. sql_type : {'table'}, can only be 'table' The type of SQL object of the destination. schema : str, default 'dbo' The SQL schema. index : bool, default True Write DataFrame index as a column. Uses the index name as the column name in the table. if_exists : {'fail', 'replace', 'append'}, default 'fail' How to behave if the table already exists. * fail: Raise a BCPandasValueError. * replace: Drop the table before inserting new values. * append: Insert new values to the existing table. Matches the dataframe columns to the database columns by name. If the database table exists then the dataframe cannot have new columns that aren't in the table, but conversely table columns can be missing from the dataframe. batch_size : int, optional Rows will be written in batches of this size at a time. By default, BCP sets this to 1000. debug : bool, default False If True, will not delete the temporary CSV and format files, and will output their location. bcp_path : str, default None The full path to the BCP utility, useful if it is not in the PATH environment variable """ # validation if df.shape[0] == 0 or df.shape[1] == 0: return assert sql_type == TABLE, "only supporting table, not view, for now" assert if_exists in IF_EXISTS_OPTIONS if df.columns.has_duplicates: raise BCPandasValueError( "Columns with duplicate names detected, SQL requires that column names be unique. " f"Duplicates: {df.columns[df.columns.duplicated(keep=False)]}") # TODO diff way to implement? 
could be big performance hit with big dataframe if index: df = df.copy(deep=True).reset_index() delim = get_delimiter(df) quotechar = get_quotechar(df) if batch_size is not None: if batch_size == 0: raise BCPandasValueError("Param batch_size can't be 0") if batch_size > df.shape[0]: raise BCPandasValueError( "Param batch_size can't be larger than the number of rows in the DataFrame" ) # save to temp path csv_file_path = get_temp_file() # replace bools with 1 or 0, this is what pandas native does when writing to SQL Server df.replace({ True: 1, False: 0 }).to_csv( path_or_buf=csv_file_path, sep=delim, header=False, index=False, # already set as new col earlier if index=True quoting=csv.QUOTE_MINIMAL, # pandas default quotechar=quotechar, line_terminator=NEWLINE, doublequote=True, escapechar=None, # not needed, as using doublequote ) logger.debug(f"Saved dataframe to temp CSV file at {csv_file_path}") # build format file fmt_file_path = get_temp_file() sql_item_exists = _sql_item_exists(sql_type=sql_type, schema=schema, table_name=table_name, creds=creds) cols_dict = None # for mypy if if_exists == "append": # get dict of column names -> order of column cols_dict = dict( pd.read_sql_query( """ SELECT COLUMN_NAME, ORDINAL_POSITION FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{_schema}' AND TABLE_NAME = '{_tbl}' """.format(_schema=schema, _tbl=table_name), creds.engine, ).values) # check that column names match in db and dataframe exactly if sql_item_exists: # the db cols are always strings, unlike df cols extra_cols = [ str(x) for x in df.columns if str(x) not in cols_dict.keys() ] if extra_cols: raise BCPandasValueError( f"Column(s) detected in the dataframe that are not in the database, " f"cannot have new columns if `if_exists=='append'`, " f"the extra column(s): {extra_cols}") fmt_file_txt = build_format_file(df=df, delimiter=delim, db_cols_order=cols_dict) with open(fmt_file_path, "w") as ff: ff.write(fmt_file_txt) logger.debug(f"Created BCP format file at {fmt_file_path}") try: if if_exists == "fail": if sql_item_exists: raise BCPandasValueError( f"The {sql_type} called {schema}.{table_name} already exists, " f"`if_exists` param was set to `fail`.") else: _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists) elif if_exists == "replace": _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists) elif if_exists == "append": if not sql_item_exists: _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists) # BCP the data in bcp( sql_item=table_name, direction=IN, flat_file=csv_file_path, format_file_path=fmt_file_path, creds=creds, sql_type=sql_type, schema=schema, batch_size=batch_size, bcp_path=bcp_path, ) finally: if not debug: logger.debug(f"Deleting temp CSV and format files") os.remove(csv_file_path) os.remove(fmt_file_path) else: logger.debug( f"`to_sql` DEBUG mode, not deleting the files. CSV file is at " f"{csv_file_path}, format file is at {fmt_file_path}")
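# A short usage sketch for the to_sql function above. The server name and credentials are
# placeholders; the SqlCreds construction follows the bcpandas README convention of
# (server, database, username, password), which is an assumption about the class not
# shown in this excerpt.
import pandas as pd

creds = SqlCreds("my_server", "my_db", "my_user", "my_password")
frame = pd.DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
to_sql(frame, "my_table", creds, index=False, if_exists="replace")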
def filter(d: pd.DataFrame, ns: SimpleNamespace) -> pd.DataFrame:
    if not hasattr(ns, "prefiltering"):
        return d
    ns = ns.prefiltering
    strategy = getattr(ns, "strategy", None)
    data = d.copy()
    if strategy == "global_threshold":
        threshold = getattr(ns, "threshold", None)
        if threshold is not None:
            if str(threshold).isdigit():
                data = PreFilter.filter_ratings_by_threshold(data, threshold)
            elif threshold == "average":
                data = PreFilter.filter_ratings_by_global_average(data)
            else:
                raise Exception("Threshold value not recognized")
        else:
            raise Exception("Threshold option is missing")
    elif strategy == "user_average":
        data = PreFilter.filter_ratings_by_user_average(data)
    elif strategy == "user_k_core":
        core = getattr(ns, "core", None)
        if core is not None:
            if str(core).isdigit():
                data = PreFilter.filter_users_by_profile_size(data, core)
            else:
                raise Exception("Core option is not a digit")
        else:
            raise Exception("Core option is missing")
    elif strategy == "item_k_core":
        core = getattr(ns, "core", None)
        if core is not None:
            if str(core).isdigit():
                data = PreFilter.filter_items_by_popularity(data, core)
            else:
                raise Exception("Core option is not a digit")
        else:
            raise Exception("Core option is missing")
    elif strategy == "iterative_k_core":
        core = getattr(ns, "core", None)
        if core is not None:
            if str(core).isdigit():
                data = PreFilter.filter_iterative_k_core(data, core)
            else:
                raise Exception("Core option is not a digit")
        else:
            raise Exception("Core option is missing")
    elif strategy == "n_rounds_k_core":
        core = getattr(ns, "core", None)
        n_rounds = getattr(ns, "rounds", None)
        if (core is not None) and (n_rounds is not None):
            if str(core).isdigit() and str(n_rounds).isdigit():
                data = PreFilter.filter_rounds_k_core(data, core, n_rounds)
            else:
                raise Exception("Core or rounds options are not digits")
        else:
            raise Exception("Core or rounds options are missing")
    elif strategy == "cold_users":
        threshold = getattr(ns, "threshold", None)
        if threshold is not None:
            if str(threshold).isdigit():
                data = PreFilter.filter_retain_cold_users(data, threshold)
            else:
                raise Exception("Threshold option is not a digit")
        else:
            raise Exception("Threshold option is missing")
    else:
        raise Exception("Missing strategy")
    return data
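# A small sketch of the config object the dispatcher above expects: `prefiltering` is a
# namespace whose `strategy` attribute selects the PreFilter routine and whose remaining
# attributes carry that strategy's options (attribute names taken from the getattr calls
# above).
from types import SimpleNamespace

config = SimpleNamespace(prefiltering=SimpleNamespace(strategy="user_k_core", core=5))
# filtered = filter(ratings_df, config)  # presumably keeps only users with >= 5 ratings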
def fit(self, data: pd.DataFrame): """Create a dataframe with the results of the Granger causality test with the specified statistical test(s). Parameters ---------- data : pd.DataFrame, shape (n_samples, n_time_series), required The dataframe containing the time series. Returns ------- self : object Returns the instance itself. """ shifts = data.copy() x_columns, y_columns = [], [] for i in range(1, self.max_shift + 1): shifts[f"x_shift_{i}"] = data[self.target_col].shift(i) shifts[f"y_shift_{i-1}"] = data[self.x_col].shift(i) x_columns.append(f"x_shift_{i}") y_columns.append(f"y_shift_{i-1}") shifts.drop([self.target_col, self.x_col], axis="columns", inplace=True) shifts = shifts.dropna() data_single = shifts[x_columns].copy() data_joint = shifts[x_columns + y_columns].copy() linreg_single = LinearRegression() linreg_joint = LinearRegression() linreg_single.fit(data_single, data[self.x_col].loc[data_single.index]) linreg_joint.fit(data_joint, data[self.x_col].loc[data_joint.index]) if "likelihood_chi2" in self.statistics or "zero_f" in self.statistics: y_pred_single = linreg_single.predict(data_single) y_pred_joint = linreg_joint.predict(data_joint) else: y_pred_single = None y_pred_joint = None # dof_single = float(data_single.shape[0] - data_single.shape[1]) dof_joint = float(data_joint.shape[0] - data_joint.shape[1]) - 1 linreg_single_residues = linreg_single._residues linreg_joint_residues = linreg_joint._residues self.results_ = [] stat_test_input = { "linreg_single_residues": linreg_single_residues, "linreg_joint_residues": linreg_joint_residues, "dof_joint": dof_joint, "max_shift": self.max_shift, "data_single": data_single, "y_pred_single": y_pred_single, "y_pred_joint": y_pred_joint, "data": data, "x_col": self.x_col, "data_joint": data_joint, "linreg_joint": linreg_joint, } for s in self.statistics: self.results_.append(STAT_TESTS[s](stat_test_input)) return self
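# For context, a hedged sketch of how the residuals computed in fit() typically feed a
# Granger F-test. The actual statistic functions live in STAT_TESTS, which is not shown
# in this excerpt; the formula below is the textbook restricted-vs-unrestricted
# comparison, not necessarily the author's exact implementation. The restricted model
# uses only the target's own lags (data_single); the unrestricted model adds the other
# series' lags (data_joint).
def granger_f_stat(rss_single, rss_joint, max_shift, dof_joint):
    # F = ((RSS_restricted - RSS_unrestricted) / q) / (RSS_unrestricted / dof),
    # where q = max_shift is the number of lag terms added by the joint model.
    return ((rss_single - rss_joint) / max_shift) / (rss_joint / dof_joint)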
def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df df = DataFrame({ "FC": ["a", "b", "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": list(range(6)), "col2": list(range(6, 12)), }) df.iloc[1, 0] = np.nan df2 = df.copy() mask = ~df2.FC.isna() cols = ["col1", "col2"] dft = df2 * 2 dft.iloc[3, 3] = np.nan expected = DataFrame({ "FC": ["a", np.nan, "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": Series([0, 1, 4, 6, 8, 10]), "col2": [12, 7, 16, np.nan, 20, 22], }) # frame on rhs df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) # with an ndarray on rhs # coerces to float64 because values has float64 dtype # GH 14001 expected = DataFrame({ "FC": ["a", np.nan, "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], "col2": [12, 7, 16, np.nan, 20, 22], }) df2 = df.copy() df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required df = DataFrame( dict( A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7], )) expected = df.copy() mask = expected["A"] == 0 for col in ["A", "B"]: expected.loc[mask, col] = df["D"] df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected)
def preprocess_states(states_df: pd.DataFrame) -> pd.DataFrame:
    """
    This function applies the preprocessing steps necessary to move from the raw
    observation to a spatial representation.

    The spatial representation is laid out as follows:
        - plane 0: Tile type (hexlayout)
        - plane 1: Tile number
        - plane 2: Robber position
        - plane 3: Game phase id
        - plane 4: Development cards left
        - plane 5: Last dice result
        - plane 6: Starting player id
        - plane 7: Current player id
        - plane 8: Current player has played a development card during its turn

        3 types of pieces, 6 ways to place them around the hex:
        - planes 9-26: Player 1 pieces
        - planes 27-44: Player 2 pieces
        - planes 45-62: Player 3 pieces
        - planes 63-80: Player 4 pieces

        See java_utils.parse_player_infos for more information:
        - planes 81-121: Player 1 public info
        - planes 122-162: Player 2 public info
        - planes 163-203: Player 3 public info
        - planes 204-244: Player 4 public info

    State shape: 245x7x7
    """
    states_df = states_df.copy()
    del states_df['touchingnumbers']
    del states_df['name']
    del states_df['id']

    states_df['gameturn'] = states_df['gameturn'].apply(ju.get_replicated_plan) \
                                                 .apply(normalize_gameturn)
    states_df['hexlayout'] = states_df['hexlayout'].apply(ju.parse_layout) \
                                                   .apply(ju.mapping_1d_2d) \
                                                   .apply(normalize_hexlayout)
    states_df['numberlayout'] = states_df['numberlayout'].apply(ju.parse_layout) \
                                                         .apply(ju.mapping_1d_2d) \
                                                         .apply(normalize_numberlayout)
    states_df['robberhex'] = states_df['robberhex'].apply(ju.get_1d_id_from_hex) \
                                                   .apply(ju.get_2d_id) \
                                                   .apply(ju.get_one_hot_plan)
    states_df['piecesonboard'] = states_df['piecesonboard'].apply(ju.parse_pieces)
    states_df['gamestate'] = states_df['gamestate'].apply(ju.parse_game_phases)
    states_df['devcardsleft'] = states_df['devcardsleft'].apply(ju.parse_devcardsleft)
    states_df['diceresult'] = states_df['diceresult'].apply(ju.parse_dice_result)
    states_df['startingplayer'] = states_df['startingplayer'].apply(ju.parse_starting_player)
    states_df['currentplayer'] = states_df['currentplayer'].apply(ju.parse_current_player)
    states_df['playeddevcard'] = states_df['playeddevcard'].apply(ju.get_replicated_plan)
    states_df['playersresources'] = states_df['playersresources'].apply(ju.parse_player_resources) \
                                                                 .apply(normalize_playersresources)
    states_df['players'] = states_df['players'].apply(ju.parse_player_infos)
    return states_df
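# A hedged sketch of how the per-column planes produced above could be assembled into the
# 245 x 7 x 7 state tensor described in the docstring. `to_state_tensor` is a hypothetical
# helper, not part of the source; it assumes each cell of a preprocessed row holds either
# a single 7x7 plane or a stack of 7x7 planes.
import numpy as np
import pandas as pd

def to_state_tensor(row: pd.Series) -> np.ndarray:
    planes = []
    for value in row:
        arr = np.asarray(value)
        # Promote a single 7x7 plane to shape (1, 7, 7) so everything concatenates.
        planes.append(arr[np.newaxis] if arr.ndim == 2 else arr)
    return np.concatenate(planes, axis=0)  # expected shape: (245, 7, 7)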
def splitFramesAndForecast(frame: pd.DataFrame, options: dict) -> pd.DataFrame:
    """
    Automates forecasting for each subframe defined by the sortColumns options
    parameter. Will return the forecast using the method with the best MAPE for
    each subframe. Will also print the MAPEs of each method for each forecast made.

    Parameters
    ----------
    frame : pd.DataFrame
        Data needed for forecast
    options : dict
        Instructions on how to read 'frame'

    Returns
    -------
    outputFrame : pd.DataFrame
        Same as original 'frame' but with all the columns associated with a
        forecast added.
    """
    # creates a list of frames, each of which will correspond to a different forecast
    frame.sort_values(by=params.getParam('sortColumns', options),
                      ascending=True, inplace=True)
    frames = list(frame.groupby(by=params.getParam('splitColumns', options)))
    outputFrame = None
    for frame in frames:
        frame = frame[1]
        frame.reset_index(drop=True, inplace=True)
        method = params.getParam('method', options)
        # specifies actions if the forecast method is set to "Auto" in the options dictionary
        if method == 'Auto':
            opts = options.copy()
            opts['method'] = 'ARIMA'
            arimaFrame = forecastSingleFrame(frame.copy(), opts)
            arimaMAPE = 1E6 if 'X_MAPE' not in arimaFrame else arimaFrame['X_MAPE'][0]

            opts = options.copy()
            opts['method'] = 'Prophet'
            prophetFrame = forecastSingleFrame(frame.copy(), opts)
            prophetMAPE = 1E6 if 'X_MAPE' not in prophetFrame else prophetFrame['X_MAPE'][0]

            opts = options.copy()
            opts['method'] = 'MLR'
            mlrFrame = forecastSingleFrame(frame.copy(), opts)
            mlrMAPE = 1E6 if 'X_MAPE' not in mlrFrame else mlrFrame['X_MAPE'][0]

            # default, in case the ensemble cannot be built or scored; without this the
            # `mapes` list below could reference an undefined name
            ensembleMAPE = 1E6
            if 'X_FORECAST' in mlrFrame and 'X_FORECAST' in prophetFrame and 'X_FORECAST' in arimaFrame:
                ensembleFrame = mlrFrame.copy()

                # we calculate MAPE using the original data column
                targetColumn = params.getParam('targetColumn', options)
                if targetColumn.startswith('X_'):
                    targetColumn = targetColumn[2:]

                # split the data into past/future based on null in target column
                numHoldoutRows = params.getParam('numHoldoutRows', options)
                lastNonNullIdx = Forecast().lastNonNullIndex(ensembleFrame[targetColumn])
                lastNonNullIdx = lastNonNullIdx - numHoldoutRows
                if numHoldoutRows > 0:
                    evalIdx = list(
                        map(lambda x: x > lastNonNullIdx and
                            x <= (lastNonNullIdx + numHoldoutRows),
                            ensembleFrame['X_INDEX']))
                else:
                    evalIdx = ensembleFrame['X_INDEX'] <= lastNonNullIdx

                ensembleFrame['X_FORECAST'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_FORECAST'],
                        arimaFrame['X_FORECAST'], prophetFrame['X_FORECAST']))
                ensembleFrame['X_LPI'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_LPI'],
                        arimaFrame['X_LPI'], prophetFrame['X_LPI']))
                ensembleFrame['X_UPI'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_UPI'],
                        arimaFrame['X_UPI'], prophetFrame['X_UPI']))

                evalFrame = ensembleFrame[evalIdx]
                try:
                    ensembleMAPE = calcMAPE(evalFrame['X_FORECAST'],
                                            evalFrame[targetColumn])
                    ensembleFrame['X_MAPE'] = ensembleMAPE
                    for index, row in ensembleFrame.iterrows():
                        ensembleFrame.loc[index, 'X_APE'] = (
                            abs(row['X_FORECAST'] - row[targetColumn]) /
                            row[targetColumn] * 100.0) if row[targetColumn] != 0 else None
                except:
                    # this may be needed if all forecasts fail and MAPE/APE cannot be calculated
                    ensembleMAPE = 1E6
                    if 'X_MAPE' not in ensembleFrame:
                        ensembleFrame['X_MAPE'] = 1E6
                    if 'X_APE' not in ensembleFrame:
                        ensembleFrame['X_APE'] = 1E6
                mapes = [mlrMAPE, arimaMAPE, prophetMAPE, ensembleMAPE]
            else:
                mapes = [mlrMAPE, arimaMAPE, prophetMAPE]
            print("Auto MAPEs (MLR, ARIMA, Prophet, Ensemble): ", mapes)
            minMAPE = min(mapes)
            if mlrMAPE <= minMAPE:
                frame = mlrFrame
                frame['X_METHOD'] = 'MLR'
            elif prophetMAPE <= minMAPE:
                frame = prophetFrame
                frame['X_METHOD'] = 'Prophet'
            elif arimaMAPE <= minMAPE:
                frame = arimaFrame
                frame['X_METHOD'] = 'ARIMA'
            else:
                frame = ensembleFrame
                frame['X_METHOD'] = 'Ensemble'
        else:
            frame = forecastSingleFrame(frame, options.copy())
        outputFrame = frame if outputFrame is None else outputFrame.append(
            frame, ignore_index=True)
    return outputFrame
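# A hedged usage sketch for splitFramesAndForecast. The option keys are the ones read via
# params.getParam above; the column names are purely illustrative.
options = {
    'sortColumns': ['store', 'date'],
    'splitColumns': ['store'],
    'method': 'Auto',        # tries MLR, ARIMA, Prophet and a median ensemble, keeps best MAPE
    'targetColumn': 'sales',
    'numHoldoutRows': 0,
}
# forecasts = splitFramesAndForecast(sales_df, options)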
def balance(self, df: pd.DataFrame, target: str):
    '''
    The balance function.

    :param df: pd.DataFrame
        The pandas Data Frame to apply the balancer.
    :param target: str
        The name of the target column.
    :return: pd.DataFrame
        A pandas Data Frame
    '''
    # Creating an internal copy of the data frame.
    self.df = df.copy()
    self.target = target

    # Checking that the target column is present in the data frame.
    if target not in self.df.columns:
        raise NoSuchColumn(f"{target} isn't a column of passed data frame")

    # Checking if the target column is a binary one.
    if len(self.df[target].unique()) != 2:
        raise NotBinaryData(f"{target} column isn't a binary column")

    # Getting the column names that are not the target one.
    self.X_columns = [column for column in self.df.columns if column != target]

    # Getting the class frequencies.
    classes_frequency = dict(self.df[target].value_counts())

    # Searching for the class with the biggest frequency.
    max_freq = 0
    for cls in classes_frequency:
        if classes_frequency[cls] > max_freq:
            majority_class = cls
            max_freq = classes_frequency[cls]

    # Getting the name of the minority class.
    minority_class = [cls for cls in classes_frequency if cls != majority_class][0]

    # Getting the total number of minority samples to generate.
    G = int((classes_frequency[majority_class] -
             classes_frequency[minority_class]) * self.__beta)

    # Getting the set of the minority samples.
    minority_samples = self.df[self.df[target] == minority_class][self.X_columns].values

    # Generating the r array - for each minority sample, the fraction of its k nearest
    # neighbours that belong to the majority class; the neighbour indexes are cached.
    r = np.array([])
    self.neighbourhood = []
    for minority_sample in minority_samples:
        predicted_indexes = self.__predict_knn(minority_sample)
        r = np.append(
            r,
            len(self.df[(self.df.index.isin(predicted_indexes) &
                         (self.df[self.target] == majority_class))]) / self.__k)
        self.neighbourhood.append(predicted_indexes)

    # Normalizing the r array.
    r = r / np.sum(r)

    # Calculating the amount of synthetic examples to generate per neighbourhood.
    G = r * G

    # Generating the synthetic data.
    self.synthetic_data = []
    for i in range(len(G)):
        for _ in range(floor(G[i])):
            choices = self.df.iloc[self.neighbourhood[i], :][
                self.df[self.target] == minority_class][self.X_columns].values
            if len(choices) < 2:
                continue
            choices = choices[np.random.randint(len(choices), size=2)]
            s = choices[0] + (choices[1] - choices[0]) * random.uniform(0, 1)
            self.synthetic_data.append(s)

    # Replacing infinity values with minimal and maximal float python values.
    self.synthetic_data = self.__infinity_check(
        np.array(self.synthetic_data).astype(float))

    # Creating the synthetic data frame.
    self.synthetic_df = pd.DataFrame(np.array(self.synthetic_data),
                                     columns=self.X_columns)

    # Rounding binary columns if needed.
    if self.__binarize:
        self.__to_binary()

    # Adding the target column.
    self.synthetic_df.loc[:, self.target] = minority_class

    new_df = pd.concat([self.df, self.synthetic_df], axis=0)
    return new_df
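# The allocation above follows the ADASYN scheme. A tiny numeric sketch, assuming k = 5
# neighbours and beta = 1: a minority sample whose neighbourhood is dominated by the
# majority class receives proportionally more synthetic points.
import numpy as np

r = np.array([4, 2, 0]) / 5   # majority fraction in each minority neighbourhood
r = r / r.sum()               # normalize, as in the code above
G_total = 10                  # (n_majority - n_minority) * beta
print(np.floor(r * G_total))  # -> [6., 3., 0.] synthetic samples per neighbourhood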
def fill_fields_and_timeseries_from_column(
    log: BoundLogger,
    existing_df: pd.DataFrame,
    new_df: pd.DataFrame,
    index_fields: List[str],
    date_field: str,
    column_to_fill: str,
) -> pd.DataFrame:
    """
    Return a copy of existing_df with column column_to_fill populated from new_df.

    Values in existing_df are copied to the return value except for column_to_fill of
    rows with index_fields present in new_df. If the data frames represent timeseries
    then pass the name of the time column in date_field. This will clear
    'column_to_fill' for all times for each index_fields in new_df. This prevents the
    return value containing timeseries with a blend of values from existing_df and
    new_df. See examples in dataset_utils_test.py

    Args:
        log: a bound structlog logger.
        existing_df: Existing data frame
        new_df: Data used to fill existing df columns
        index_fields: List of columns to use as common index.
        date_field: the time column name if the data frames represent timeseries,
            otherwise ''
        column_to_fill: column to add into existing_df from data_source

    Returns: Updated DataFrame with requested column filled from data_source data.
    """
    # Here is a nice tutorial on indexing:
    # https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html

    # Copy so this code can work on the data inplace without modifying the inputs.
    existing_df = existing_df.copy()
    new_df = new_df.copy()

    if column_to_fill not in existing_df.columns:
        existing_df[column_to_fill] = None

    if date_field:
        _clear_common_values(log, existing_df, new_df, index_fields, column_to_fill)
        # From here down treat the date as part of the index label for joining rows of
        # existing_df and new_df
        index_fields.append(date_field)

    new_df.set_index(index_fields, inplace=True)
    if not existing_df.empty:
        existing_df.set_index(index_fields, inplace=True)
        common_labels = existing_df.index.intersection(new_df.index)
    else:
        # Treat an empty existing_df the same as one that has no rows in common with new_df
        common_labels = []

    if len(common_labels):
        # existing_df is not empty and contains labels in common with new_df. When
        # date_field is set the date is included in the compared labels and dates that
        # are not in existing_df are appended later.
        # Sort suggested by 'PerformanceWarning: indexing past lexsort depth may impact
        # performance'. common_labels is a sparse subset of all labels in both DataFrames
        # and the values are looked up one by one.
        existing_df.sort_index(inplace=True, sort_remaining=True)
        new_df.sort_index(inplace=True, sort_remaining=True)
        # TODO(tombrown): I have a hunch that this is mostly copying NaN values. Check
        # and consider optimizing by ignoring rows without a real value in column_to_fill.
        existing_df.loc[common_labels.values, column_to_fill] = new_df.loc[
            common_labels.values, column_to_fill]
        diff = new_df.index.difference(common_labels)
        # If there are no missing rows, simply return the existing dataframe (by this
        # point all fields have been merged in).
        if not diff.size:
            return existing_df.reset_index()
        missing_new_data = new_df.loc[diff, [column_to_fill]]
    else:
        # There are no labels in common so all rows of new_df are to be appended to existing_df.
        missing_new_data = new_df.loc[:, [column_to_fill]]

    # Revert 'fips', 'state' etc back to regular columns
    existing_df.reset_index(inplace=True)
    missing_new_data.reset_index(inplace=True)
    # Concat the existing data with the new rows from new_df, creating a new integer index
    return pd.concat([existing_df, missing_new_data], ignore_index=True)
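# A tiny worked sketch of the non-timeseries path (date_field=""): the common "fips" row
# gets its column overwritten from new_df, and the row only present in new_df is
# appended. On this path the logger is never used, so None is passed for it here.
import pandas as pd

existing = pd.DataFrame({"fips": ["01", "02"], "cases": [10, 20]})
new = pd.DataFrame({"fips": ["02", "03"], "cases": [99, 5]})
merged = fill_fields_and_timeseries_from_column(
    None, existing, new, index_fields=["fips"], date_field="", column_to_fill="cases"
)
# merged -> fips 01 keeps 10, fips 02 becomes 99, fips 03 appended with 5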
def test_compute_predictions_3(self, tmp_path): # Test with an historical predictions delta > 1 # This means that historical predictions are not computed starting from initial index 1-step ahead at time, # but they are computed every $delta time points. ing_data = DataFrame({ "a": pandas.date_range('2000-01-01', periods=30), "b": np.arange(30, 60), "c": np.arange(60, 90) }) ing_data.set_index("a", inplace=True) ing_data = add_freq(ing_data, "D") param_config = { "input_parameters": {}, "model_parameters": { "test_values": 2, "delta_training_percentage": 100, "prediction_lags": 10, "possible_transformations": "none", "models": "fbprophet,mockup", "main_accuracy_estimator": "mae", }, "historical_prediction_parameters": { "initial_index": "2000-01-20", "save_path": os.path.join(tmp_path, "test3.pkl"), "delta": 3 } } timeseries_containers = compute_historical_predictions( ingested_data=ing_data, param_config=param_config) assert len(timeseries_containers) == 2 assert timeseries_containers[0].timeseries_data.columns[0] == "b" assert timeseries_containers[1].timeseries_data.columns[0] == "c" assert len(timeseries_containers[0].models) == 2 assert len(timeseries_containers[1].models) == 2 for s in timeseries_containers: scen_name = s.timeseries_data.columns[0] for model in s.historical_prediction: hist_prediction = s.historical_prediction[model] assert len(hist_prediction) == 10 id = 0 for i in pandas.date_range('2000-01-21', periods=10): assert hist_prediction.index[id] == i id += 1 for endpoint in [ *pandas.date_range('2000-01-20', periods=4, freq="3d") ]: tr = ing_data.copy() fb_tr = tr.loc[:endpoint] fb_tr = fb_tr[[scen_name]] fbmodel = Prophet() fb_tr.reset_index(inplace=True) fb_tr.columns = ['ds', 'y'] with suppress_stdout_stderr(): fbmodel.fit(fb_tr) future_df = pd.DataFrame(index=pd.date_range( freq="1d", start=endpoint + pandas.Timedelta(days=1), periods=3), columns=["yhat"]) future = future_df.reset_index() future.rename(columns={'index': 'ds'}, inplace=True) forecast = fbmodel.predict(future) forecast.set_index('ds', inplace=True) expected_hist_pred = forecast.loc[:, 'yhat'] expected_hist_pred = expected_hist_pred.astype(object) expected_hist_pred.rename(scen_name, inplace=True) if endpoint == pd.Timestamp( '2000-01-29 00:00:00' ): # Last point, remove last 2 points expected_hist_pred = expected_hist_pred.iloc[0:1] computed_hist_pred = s.historical_prediction['fbprophet'].loc[ endpoint + pandas.Timedelta(days=1):endpoint + pandas.Timedelta(days=3), scen_name] assert expected_hist_pred.equals(computed_hist_pred)
AND GL_ACCOUNT_NUMBER NOT IN (\'114000\', \'113000\', \'119800\')\ AND ACCOUNTING_DOCUMENT_TYPE IN (\'ZR\',\'DA\',\'DR\',\'ZM\',\'DG\',\'DZ\',\'RC\',\ \'RD\',\'RV\',\'DS\', \'KN\')\ AND (CLEARING_DOCUMENT_NUMBER IS NOT NULL \ AND CLEARING_DOCUMENT_NUMBER <> \' \' AND CLEARING_DOCUMENT_NUMBER <> \'\')\ AND (REVERSAL_INDICATOR IS NULL OR REVERSAL_INDICATOR = \'\' OR REVERSAL_INDICATOR = \' \') \ ORDER BY "AMOUNT_IN_GROUP_CONSOLIDATED"\ ) as inv \ LEFT JOIN "_SYS_BIC"."HCDW.IT.SHARED/D_CUSTOMER" as cust \ ON inv.CUSTOMER_NUMBER = cust.CUSTOMER_NUMBER \ LEFT JOIN "_SYS_BIC"."HCDW.IT.SHARED/D_ENTERPRISE_DATE" as d on inv.POSTING_DATE = d.full_date_trimmed \ ORDER BY "POSTING_DATE" ') df = DataFrame(cursor.fetchall()) df.columns = [x[0] for x in cursor.description] ##df.to_csv(r'C:\Users\cthieme\OneDrive - Micron Technology, Inc\Test folder\test_ar_data_export.csv', index = False) hana_data_2019 = df.copy() #Pulling 2018 Data from Hana ####################################### connection = pyhdb.connect(host="xx.xxx.com", port=xxx, user="******", password="******") cursor = connection.cursor() cursor.execute('SELECT inv."CUSTOMER_NUMBER", cust."CUSTOMER_NAME_1",\ cust."ACCOUNT_GROUP",\ cust."COUNTRY",\ inv."FISCAL_QUARTER",\ inv."FISCAL_PERIOD",\ d."FISCAL_DAY_OF_QUARTER_NUMBER",\ d."FISCAL_DAY_OF_PERIOD_NUMBER",\ d."WORK_WEEK",\
def test_unstack_nan_index(self): # GH7466 cast = lambda val: "{0:1}".format("" if val != val else val) def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) assert left == right df = DataFrame({ "jim": ["a", "b", np.nan, "d"], "joe": ["w", "x", "y", "z"], "jolie": ["a.w", "b.x", " .y", "d.z"], }) left = df.set_index(["jim", "joe"]).unstack()["jolie"] right = df.set_index(["joe", "jim"]).unstack()["jolie"].T tm.assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) verify(udf["jolie"]) df = DataFrame({ "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 + ["e"] * 2 + ["b"] * 5, "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 + [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2, "3rd": [ 67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51, ], }) df["4th"], df["5th"] = ( df.apply(lambda r: ".".join(map(cast, r)), axis=1), df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), ) for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) for col in ["4th", "5th"]: verify(udf[col]) # GH7403 df = pd.DataFrame({ "A": list("aaaabbbb"), "B": range(8), "C": range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [ [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], ] vals = list(map(list, zip(*vals))) idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = DataFrame({ "A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8) }) df.iloc[2, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = pd.DataFrame({ "A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH7401 df = pd.DataFrame({ "A": list("aaaaabbbbb"), "B": (date_range("2012-01-01", periods=5).tolist() * 2), "C": np.arange(10), }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) idx = Index(["a", "b"], name="A") cols = MultiIndex( levels=[["C"], date_range("2012-01-01", periods=5)], codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, "B"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH4862 vals = [ ["Hg", np.nan, np.nan, 680585148], ["U", 0.0, np.nan, 680585148], ["Pb", 7.07e-06, np.nan, 680585148], ["Sn", 
2.3614e-05, 0.0133, 680607017], ["Ag", 0.0, 0.0133, 680607017], ["Hg", -0.00015, 0.0133, 680607017], ] df = DataFrame( vals, columns=["agent", "change", "dosage", "s_id"], index=[17263, 17264, 17265, 17266, 17267, 17268], ) left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() vals = [ [np.nan, np.nan, 7.07e-06, np.nan, 0.0], [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], ] idx = MultiIndex( levels=[[680585148, 680607017], [0.0133]], codes=[[0, 1], [-1, 0]], names=["s_id", "dosage"], ) cols = MultiIndex( levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, "agent"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) tm.assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame({ "1st": [1, 2, 1, 2, 1, 2], "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), "jim": 100 + np.arange(6), "joe": (np.random.randn(6) * 10).round(2), }) df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) for col in ["jim", "joe"]: for _, r in df.iterrows(): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    print("Constructing captions")
    t0 = time.time()
    X = X.copy()

    def __extract_hashtags(string):
        try:
            hashtags = [
                re.sub(r"(\W+)$", "", j) for j in set(
                    [i for i in string.split() if i.startswith("#")])
            ]
        except:
            return np.nan
        if len(hashtags) == 0:
            return np.nan
        else:
            return hashtags

    def __extract_pcredits(row):
        string = row['caption']
        try:
            pcredits = [
                re.sub(r"(\W+)$", "", j) for j in set(
                    [i for i in string.split() if i.startswith("@")])
            ]
            pcredits.append('@' + row['credits'])
        except:
            return np.nan
        return pcredits

    def __hashtagQC(hashtags):
        hashstring = ''.join(hashtags).lower()
        for word in self.hashtagQClist:
            if word in hashstring:
                return np.nan
        return [e.lower() for e in hashtags if len(e) > 1]

    def __generate_repost_caption(row):
        caption = row['caption']
        date = row['postdate']
        credit = row['credits']
        a = re.split("[.!]+", caption)[0]
        if a[-1] == '?':
            # strings are immutable, so replace the trailing '?' by rebuilding the string
            a = a[:-1] + '!'
        else:
            a = a + '!'
        a = re.sub('[@$""]', '', a)
        post_date = f"{date.month:02d}-{date.day:02d}-{date.year}"
        comment = f'Here is a segment from the original post, by @{credit} on {post_date}: "{a}"'
        return comment

    X['hashtags'] = X['caption'].apply(__extract_hashtags)
    X['pcredits'] = X.apply(lambda row: __extract_pcredits(row), axis=1)
    X['postdate'] = pd.to_datetime(X['postdate'])
    X.dropna(inplace=True)
    X['hashtags'] = X['hashtags'].apply(__hashtagQC)
    X['repost_comment'] = X.apply(
        lambda row: __generate_repost_caption(row), axis=1)
    X.dropna(inplace=True)
    X.reset_index(inplace=True, drop=True)
    t1 = time.time()
    print(f'Done in {t1-t0} seconds')
    return X
def get_cubes(
    p: pd.DataFrame,
    grid: float = 1.0,
) -> pd.DataFrame:
    v = correctlyRotateDataFrame(p.copy())
    v["count"] = 0
    counts = pd.DataFrame(data={
        "x": [],
        "y": [],
        "z": [],
        "r": [],
        "g": [],
        "b": [],
        "c": [],
    })

    global global_minX
    global global_maxX
    global global_minY
    global global_maxY
    global_minX = min(v['x'])
    global_maxX = max(v['x'])
    global_minY = min(v['y'])
    global_maxY = max(v['y'])

    for x, y, z in itertools.product(
            np.arange(min(v["x"]), max(v["x"]), grid),
            np.arange(min(v["y"]), max(v["y"]), grid),
            np.arange(min(v["z"]), max(v["z"]), grid),
    ):
        overscan = 0.0
        dots = v[(v["x"] >= x - overscan) & (v["x"] < x + grid + overscan)
                 & (v["y"] >= y - overscan) & (v["y"] < y + grid + overscan)
                 & (v["z"] >= z - overscan) & (v["z"] < z + grid + overscan)]
        count = len(dots)
        counts = counts.append(
            {
                "x": x,
                "y": y,
                "z": z,
                "r": dots["r"].mean(),
                "g": dots["g"].mean(),
                "b": dots["b"].mean(),
                "c": count,
            },
            ignore_index=True,
        )
        v.loc[(v["x"] >= x) & (v["x"] < x + grid)
              & (v["y"] >= y) & (v["y"] < y + grid)
              & (v["z"] >= z) & (v["z"] < z + grid),
              "count",
              ] = count
        # print(f"{x}, {y}: {count}")

    v.loc[v["count"] > 20, "count"] = 20
    v = v[v["count"] > 5]
    return counts
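# A brief usage sketch for get_cubes, assuming correctlyRotateDataFrame and the
# module-level globals are defined elsewhere in the surrounding module. The point cloud
# columns follow the convention above: x/y/z coordinates plus r/g/b colour channels.
# Each returned row is one grid cell with the mean colour and the point count `c` of
# the points that fell inside it.
import numpy as np
import pandas as pd

points = pd.DataFrame({
    "x": np.random.rand(1000) * 10,
    "y": np.random.rand(1000) * 10,
    "z": np.random.rand(1000) * 2,
    "r": np.random.rand(1000),
    "g": np.random.rand(1000),
    "b": np.random.rand(1000),
})
cubes = get_cubes(points, grid=1.0)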
def test_clean_input_format(df_countries: pd.DataFrame) -> None: df_clean_name = clean_country(df_countries, "messy_country", input_format="name") df_clean_official = clean_country(df_countries, "messy_country", input_format="official") df_clean_alpha2 = clean_country(df_countries, "messy_country", input_format="alpha-2") df_clean_alpha3 = clean_country(df_countries, "messy_country", input_format="alpha-3") df_clean_numeric = clean_country(df_countries, "messy_country", input_format="numeric") df_check_name_and_official = df_countries.copy() df_check_name_and_official["messy_country_clean"] = [ "Canada", "Canada", np.nan, np.nan, "Ireland", "DR Congo", "Congo Republic", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, ] df_check_alpha2 = df_countries.copy() df_check_alpha2["messy_country_clean"] = [ np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, "American Samoa", "Turkey", "Belize", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, ] df_check_alpha3 = df_countries.copy() df_check_alpha3["messy_country_clean"] = [ np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, "Argentina", "Bouvet Island", "New Zealand", np.nan, np.nan, np.nan, ] df_check_numeric = df_countries.copy() df_check_numeric["messy_country_clean"] = [ np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, "Greenland", "Estonia", "Yemen", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, ] assert df_clean_name.equals(df_check_name_and_official) assert df_clean_official.equals(df_check_name_and_official) assert df_clean_alpha2.equals(df_check_alpha2) assert df_clean_alpha3.equals(df_check_alpha3) assert df_clean_numeric.equals(df_check_numeric)
def test_indexing_with_datetime_tz(self): # GH#8260 # support datetime64 with tz idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") dr = date_range("20130110", periods=3) df = DataFrame({"A": idx, "B": dr}) df["C"] = idx df.iloc[1, 1] = pd.NaT df.iloc[1, 2] = pd.NaT # indexing result = df.iloc[1] expected = Series( [ Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), pd.NaT, pd.NaT ], index=list("ABC"), dtype="object", name=1, ) tm.assert_series_equal(result, expected) result = df.loc[1] expected = Series( [ Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), pd.NaT, pd.NaT ], index=list("ABC"), dtype="object", name=1, ) tm.assert_series_equal(result, expected) # indexing - fast_xs df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")}) result = df.iloc[5] expected = Series([Timestamp("2014-01-06 00:00:00+0000", tz="UTC")], index=["a"], name=5) tm.assert_series_equal(result, expected) result = df.loc[5] tm.assert_series_equal(result, expected) # indexing - boolean result = df[df.a > df.a[3]] expected = df.iloc[4:] tm.assert_frame_equal(result, expected) # indexing - setting an element df = DataFrame( data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]), columns=["time"], ) df["new_col"] = ["new", "old"] df.time = df.set_index("time").index.tz_localize("UTC") v = df[df.new_col == "new"].set_index("time").index.tz_convert( "US/Pacific") # trying to set a single element on a part of a different timezone # this converts to object df2 = df.copy() df2.loc[df2.new_col == "new", "time"] = v expected = Series([v[0], df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v)
def test_frame_iloc_setitem_callable(self): # GH#11485 df = DataFrame({ "X": [1, 2, 3, 4], "Y": list("aabb") }, index=list("ABCD")) # return location res = df.copy() res.iloc[lambda x: [1, 3]] = 0 exp = df.copy() exp.iloc[[1, 3]] = 0 tm.assert_frame_equal(res, exp) res = df.copy() res.iloc[lambda x: [1, 3], :] = -1 exp = df.copy() exp.iloc[[1, 3], :] = -1 tm.assert_frame_equal(res, exp) res = df.copy() res.iloc[lambda x: [1, 3], lambda x: 0] = 5 exp = df.copy() exp.iloc[[1, 3], 0] = 5 tm.assert_frame_equal(res, exp) res = df.copy() res.iloc[lambda x: [1, 3], lambda x: [0]] = 25 exp = df.copy() exp.iloc[[1, 3], [0]] = 25 tm.assert_frame_equal(res, exp) # mixture res = df.copy() res.iloc[[1, 3], lambda x: 0] = -3 exp = df.copy() exp.iloc[[1, 3], 0] = -3 tm.assert_frame_equal(res, exp) res = df.copy() res.iloc[[1, 3], lambda x: [0]] = -5 exp = df.copy() exp.iloc[[1, 3], [0]] = -5 tm.assert_frame_equal(res, exp) res = df.copy() res.iloc[lambda x: [1, 3], 0] = 10 exp = df.copy() exp.iloc[[1, 3], 0] = 10 tm.assert_frame_equal(res, exp) res = df.copy() res.iloc[lambda x: [1, 3], [0]] = [-5, -5] exp = df.copy() exp.iloc[[1, 3], [0]] = [-5, -5] tm.assert_frame_equal(res, exp)
def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() self.assertTrue(s1.equals(s2)) s1[1] = 99 self.assertFalse(s1.equals(s2)) # NaNs compare as equal s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) s2 = s1.copy() self.assertTrue(s1.equals(s2)) s2[0] = 9.9 self.assertFalse(s1.equals(s2)) idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) s1 = Series([1, 2, np.nan], index=idx) s2 = s1.copy() self.assertTrue(s1.equals(s2)) # Add object dtype column with nans index = np.random.random(10) df1 = DataFrame(np.random.random(10, ), index=index, columns=['floats']) df1['text'] = 'the sky is so blue. we could use more chocolate.'.split( ) df1['start'] = date_range('2000-1-1', periods=10, freq='T') df1['end'] = date_range('2000-1-1', periods=10, freq='D') df1['diff'] = df1['end'] - df1['start'] df1['bool'] = (np.arange(10) % 3 == 0) df1.ix[::2] = nan df2 = df1.copy() self.assertTrue(df1['text'].equals(df2['text'])) self.assertTrue(df1['start'].equals(df2['start'])) self.assertTrue(df1['end'].equals(df2['end'])) self.assertTrue(df1['diff'].equals(df2['diff'])) self.assertTrue(df1['bool'].equals(df2['bool'])) self.assertTrue(df1.equals(df2)) self.assertFalse(df1.equals(object)) # different dtype different = df1.copy() different['floats'] = different['floats'].astype('float32') self.assertFalse(df1.equals(different)) # different index different_index = -index different = df2.set_index(different_index) self.assertFalse(df1.equals(different)) # different columns different = df2.copy() different.columns = df2.columns[::-1] self.assertFalse(df1.equals(different)) # DatetimeIndex index = pd.date_range('2000-1-1', periods=10, freq='T') df1 = df1.set_index(index) df2 = df1.copy() self.assertTrue(df1.equals(df2)) # MultiIndex df3 = df1.set_index(['text'], append=True) df2 = df1.set_index(['text'], append=True) self.assertTrue(df3.equals(df2)) df2 = df1.set_index(['floats'], append=True) self.assertFalse(df3.equals(df2)) # NaN in index df3 = df1.set_index(['floats'], append=True) df2 = df1.set_index(['floats'], append=True) self.assertTrue(df3.equals(df2))