class GetNumericData(object): def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' with warnings.catch_warnings(record=True): self.df = self.df.consolidate() def time_frame_get_numeric_data(self): self.df._get_numeric_data()
class GetNumericData(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' self.df = self.df.consolidate() def time_frame_get_numeric_data(self): self.df._get_numeric_data()
def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) result.dtypes str(result) # assignment # GH 3687 arr = np.random.randn(3, 2) idx = lrange(2) df = DataFrame(arr, columns=['A', 'A']) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) idx = date_range('20130101', periods=4, freq='Q-NOV') df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['a', 'a', 'a', 'a']) df.columns = idx expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['foo', 'bar', 'foo', 'hello']) df['string'] = 'bah' expected = DataFrame( [[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']], columns=['foo', 'bar', 'foo', 'hello', 'string']) check(df, expected) with assertRaisesRegexp(ValueError, 'Length of value'): df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) # insert same dtype df['foo2'] = 3 expected = DataFrame( [[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], [2, 1, 3, 5, 'bah', 3]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # set (non-dup) df['foo2'] = 4 expected = DataFrame( [[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], [2, 1, 3, 5, 'bah', 4]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) df['foo2'] = 3 # delete (non dup) del df['bar'] expected = DataFrame( [[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]], columns=['foo', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # try to delete again (its not consolidated) del df['hello'] expected = DataFrame( [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # consolidate df = df.consolidate() expected = DataFrame( [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # insert df.insert(2, 'new_col', 5.) expected = DataFrame( [[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'string', 'foo2']) check(df, expected) # insert a dup assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2, 'new_col', 4.) df.insert(2, 'new_col', 4., allow_duplicates=True) expected = DataFrame( [[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3], [2, 3, 4., 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2']) check(df, expected) # delete (dup) del df['foo'] expected = DataFrame( [[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]], columns=['new_col', 'new_col', 'string', 'foo2']) assert_frame_equal(df, expected) # dup across dtypes df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], columns=['foo', 'bar', 'foo', 'hello']) check(df) df['foo2'] = 7. expected = DataFrame( [[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], [2, 1, 3., 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) result = df['foo'] expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]], columns=['foo', 'foo']) check(result, expected) # multiple replacements df['foo'] = 'string' expected = DataFrame( [['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) del df['foo'] expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'hello', 'foo2']) check(df, expected) # values df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x']) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) self.assertTrue((result == expected).all().all()) # rename, GH 4403 df4 = DataFrame( { 'TClose': [22.02], 'RT': [0.0454], 'TExg': [0.0422] }, index=MultiIndex.from_tuples([(600809, 20130331)], names=['STK_ID', 'RPT_Date'])) df5 = DataFrame( { 'STK_ID': [600809] * 3, 'RPT_Date': [20120930, 20121231, 20130331], 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')], 'TClose': [38.05, 41.66, 30.01] }, index=MultiIndex.from_tuples([(600809, 20120930), (600809, 20121231), (600809, 20130331)], names=['STK_ID', 'RPT_Date'])) k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True) result = k.rename(columns={ 'TClose_x': 'TClose', 'TClose_y': 'QT_Close' }) str(result) result.dtypes expected = (DataFrame( [[0.0454, 22.02, 0.0422, 20130331, 600809, u('饡驦'), 30.01]], columns=[ 'RT', 'TClose', 'TExg', 'RPT_Date', 'STK_ID', 'STK_Name', 'QT_Close' ]).set_index(['STK_ID', 'RPT_Date'], drop=False)) assert_frame_equal(result, expected) # reindex is invalid! df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) self.assertRaises(ValueError, df.reindex, columns=['bar']) self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo']) # drop df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) result = df.drop(['a'], axis=1) expected = DataFrame([[1], [1], [1]], columns=['bar']) check(result, expected) result = df.drop('a', axis=1) check(result, expected) # describe df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['bar', 'a', 'a'], dtype='float64') result = df.describe() s = df.iloc[:, 0].describe() expected = pd.concat([s, s, s], keys=df.columns, axis=1) check(result, expected) # check column dups with index equal and not equal to df's index df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'A']) for index in [df.index, pd.Index(list('edcba'))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) expected_df = DataFrame.from_items([('A', expected_ser), ('B', this_df['B']), ('A', expected_ser)]) this_df['A'] = index check(this_df, expected_df) # operations for op in ['__add__', '__mul__', '__sub__', '__truediv__']: df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) expected = getattr(df, op)(df) expected.columns = ['A', 'A'] df.columns = ['A', 'A'] result = getattr(df, op)(df) check(result, expected) # multiple assignments that change dtypes # the location indexer is a slice # GH 6120 df = DataFrame(np.random.randn(5, 2), columns=['that', 'that']) expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) df['that'] = 1.0 check(df, expected) df = DataFrame(np.random.rand(5, 2), columns=['that', 'that']) expected = DataFrame(1, index=range(5), columns=['that', 'that']) df['that'] = 1 check(df, expected)
def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) result.dtypes str(result) # assignment # GH 3687 arr = np.random.randn(3, 2) idx = lrange(2) df = DataFrame(arr, columns=['A', 'A']) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) idx = date_range('20130101', periods=4, freq='Q-NOV') df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['a', 'a', 'a', 'a']) df.columns = idx expected = DataFrame( [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['foo', 'bar', 'foo', 'hello']) df['string'] = 'bah' expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']], columns=['foo', 'bar', 'foo', 'hello', 'string']) check(df, expected) with assertRaisesRegexp(ValueError, 'Length of value'): df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) # insert same dtype df['foo2'] = 3 expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], [2, 1, 3, 5, 'bah', 3]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # set (non-dup) df['foo2'] = 4 expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], [2, 1, 3, 5, 'bah', 4]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) df['foo2'] = 3 # delete (non dup) del df['bar'] expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]], columns=['foo', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # try to delete again (its not consolidated) del df['hello'] expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # consolidate df = df.consolidate() expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # insert df.insert(2, 'new_col', 5.) expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'string', 'foo2']) check(df, expected) # insert a dup assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2, 'new_col', 4.) df.insert(2, 'new_col', 4., allow_duplicates=True) expected = DataFrame([[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3], [2, 3, 4., 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2']) check(df, expected) # delete (dup) del df['foo'] expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]], columns=['new_col', 'new_col', 'string', 'foo2']) assert_frame_equal(df, expected) # dup across dtypes df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], columns=['foo', 'bar', 'foo', 'hello']) check(df) df['foo2'] = 7. expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], [2, 1, 3., 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) result = df['foo'] expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]], columns=['foo', 'foo']) check(result, expected) # multiple replacements df['foo'] = 'string' expected = DataFrame([['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) del df['foo'] expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[ 'bar', 'hello', 'foo2']) check(df, expected) # values df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x']) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) self.assertTrue((result == expected).all().all()) # rename, GH 4403 df4 = DataFrame( {'TClose': [22.02], 'RT': [0.0454], 'TExg': [0.0422]}, index=MultiIndex.from_tuples([(600809, 20130331)], names=['STK_ID', 'RPT_Date'])) df5 = DataFrame({'STK_ID': [600809] * 3, 'RPT_Date': [20120930, 20121231, 20130331], 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')], 'TClose': [38.05, 41.66, 30.01]}, index=MultiIndex.from_tuples( [(600809, 20120930), (600809, 20121231), (600809, 20130331)], names=['STK_ID', 'RPT_Date'])) k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True) result = k.rename( columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'}) str(result) result.dtypes expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809, u('饡驦'), 30.01]], columns=['RT', 'TClose', 'TExg', 'RPT_Date', 'STK_ID', 'STK_Name', 'QT_Close']) .set_index(['STK_ID', 'RPT_Date'], drop=False)) assert_frame_equal(result, expected) # reindex is invalid! df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) self.assertRaises(ValueError, df.reindex, columns=['bar']) self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo']) # drop df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) result = df.drop(['a'], axis=1) expected = DataFrame([[1], [1], [1]], columns=['bar']) check(result, expected) result = df.drop('a', axis=1) check(result, expected) # describe df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['bar', 'a', 'a'], dtype='float64') result = df.describe() s = df.iloc[:, 0].describe() expected = pd.concat([s, s, s], keys=df.columns, axis=1) check(result, expected) # check column dups with index equal and not equal to df's index df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'A']) for index in [df.index, pd.Index(list('edcba'))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) expected_df = DataFrame.from_items([('A', expected_ser), ('B', this_df['B']), ('A', expected_ser)]) this_df['A'] = index check(this_df, expected_df) # operations for op in ['__add__', '__mul__', '__sub__', '__truediv__']: df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) expected = getattr(df, op)(df) expected.columns = ['A', 'A'] df.columns = ['A', 'A'] result = getattr(df, op)(df) check(result, expected) # multiple assignments that change dtypes # the location indexer is a slice # GH 6120 df = DataFrame(np.random.randn(5, 2), columns=['that', 'that']) expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) df['that'] = 1.0 check(df, expected) df = DataFrame(np.random.rand(5, 2), columns=['that', 'that']) expected = DataFrame(1, index=range(5), columns=['that', 'that']) df['that'] = 1 check(df, expected)
def test_multilevel_consolidate(self): index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), ('bar', 'one'), ('bar', 'two')]) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) df['Totals', ''] = df.sum(1) df = df.consolidate()
def test_multilevel_consolidate(self): index = MultiIndex.from_tuples([("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) df["Totals", ""] = df.sum(1) df = df.consolidate()