Пример #1
0
class GetNumericData(object):
    """Benchmark fixture around ``DataFrame._get_numeric_data``.

    ``setup`` builds a 10000 x 25 frame of random floats, appends two
    constant string columns (``'foo'`` and ``'bar'``) and consolidates the
    result, storing it on ``self.df``.
    """

    def setup(self):
        """Build the mixed-dtype frame and store its consolidated form."""
        frame = DataFrame(np.random.randn(10000, 25))
        frame['foo'] = 'bar'
        frame['bar'] = 'baz'
        # ``consolidate`` is the deprecated public spelling and is absent
        # from newer pandas; ``_consolidate`` replaces it.  Looking the
        # method up this way works on both sides of the rename and avoids
        # having to silence warnings wholesale.
        try:
            consolidate = frame._consolidate
        except AttributeError:
            consolidate = frame.consolidate
        self.df = consolidate()

    def time_frame_get_numeric_data(self):
        """Call ``_get_numeric_data`` on the prepared frame; result discarded."""
        self.df._get_numeric_data()
Пример #2
0
class GetNumericData(object):
    """Benchmark fixture around ``DataFrame._get_numeric_data``.

    The frame prepared by ``setup`` has 25 random float columns plus two
    constant string columns, consolidated before use.
    """

    def setup(self):
        """Create ``self.df``: 10000 rows, 25 float and 2 string columns."""
        frame = DataFrame(np.random.randn(10000, 25))
        for label, value in (('foo', 'bar'), ('bar', 'baz')):
            frame[label] = value
        # Prefer the private ``_consolidate`` (the successor of the
        # deprecated public ``consolidate``); fall back to the old name on
        # pandas releases that predate the rename.  No warning filter is
        # needed on either path.
        try:
            consolidate = frame._consolidate
        except AttributeError:
            consolidate = frame.consolidate
        self.df = consolidate()

    def time_frame_get_numeric_data(self):
        """Call ``_get_numeric_data`` on the prepared frame; result discarded."""
        self.df._get_numeric_data()
Пример #3
0
class GetNumericData(object):
    """Benchmark fixture around ``DataFrame._get_numeric_data``.

    ``setup`` prepares a consolidated frame of 25 random float columns and
    two constant string columns on ``self.df``.
    """

    # Class-level setting; presumably read by the benchmark runner -- confirm.
    goal_time = 0.2

    def setup(self):
        """Build the mixed-dtype frame and store its consolidated form."""
        frame = DataFrame(np.random.randn(10000, 25))
        frame['foo'] = 'bar'
        frame['bar'] = 'baz'
        # The public ``consolidate`` no longer exists in newer pandas, where
        # ``_consolidate`` takes its place; try the new name first and fall
        # back so the benchmark runs against either API.
        try:
            consolidate = frame._consolidate
        except AttributeError:
            consolidate = frame.consolidate
        self.df = consolidate()

    def time_frame_get_numeric_data(self):
        """Call ``_get_numeric_data`` on the prepared frame; result discarded."""
        self.df._get_numeric_data()
Пример #4
0
class GetNumericData(object):
    """Benchmark fixture around ``DataFrame._get_numeric_data``.

    The frame stored on ``self.df`` has 10000 rows, 25 random float columns
    and two constant string columns, and is consolidated during ``setup``.
    """

    # Class-level setting; presumably read by the benchmark runner -- confirm.
    goal_time = 0.2

    def setup(self):
        """Create ``self.df``: 10000 rows, 25 float and 2 string columns."""
        frame = DataFrame(np.random.randn(10000, 25))
        for label, value in (('foo', 'bar'), ('bar', 'baz')):
            frame[label] = value
        # ``_consolidate`` supersedes the public ``consolidate``, which newer
        # pandas no longer provides; the fallback keeps older releases
        # working.
        try:
            consolidate = frame._consolidate
        except AttributeError:
            consolidate = frame.consolidate
        self.df = consolidate()

    def time_frame_get_numeric_data(self):
        """Call ``_get_numeric_data`` on the prepared frame; result discarded."""
        self.df._get_numeric_data()
Пример #5
0
    def test_column_dups_operations(self):
        """Exercise DataFrame operations on frames with duplicate column labels.

        The sections below run in order and several of them keep mutating
        the same ``df``: relabelling columns, inserting, setting and
        deleting columns, consolidating, renaming after a merge,
        reindexing, dropping, ``describe()``, assigning an index to a
        duplicated label, arithmetic dunder methods, and scalar assignment
        to a duplicated label.
        """
        def check(result, expected=None):
            """Compare ``result`` with ``expected`` (when given) and touch it.

            ``dtypes`` and ``str()`` are evaluated and discarded, so the
            only thing they can contribute is an exception.
            """
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                             columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame(
            [[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']],
            columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        # the inserted value is one element shorter than the index
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
             [2, 1, 3, 5, 'bah', 3]],
            columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
             [2, 1, 3, 5, 'bah', 4]],
            columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)
        # restore the value the following sections expect
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame(
            [[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]],
            columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame(
            [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]],
            columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        # NOTE(review): newer pandas replaces the public ``consolidate``
        # with ``_consolidate`` -- confirm against the targeted version.
        df = df.consolidate()
        expected = DataFrame(
            [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]],
            columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame(
            [[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]],
            columns=['foo', 'foo', 'new_col', 'string', 'foo2'])
        check(df, expected)

        # insert a dup
        # rejected without allow_duplicates, accepted with it
        assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2,
                           'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame(
            [[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3],
             [2, 3, 4., 5., 'bah', 3]],
            columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame(
            [[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]],
            columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame(
            [[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], [2, 1, 3., 5, 7.]],
            columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        # selecting a duplicated label is compared against a two-column frame
        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame(
            [['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.],
             ['string', 1, 'string', 5, 7.]],
            columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                             columns=['bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {
                'TClose': [22.02],
                'RT': [0.0454],
                'TExg': [0.0422]
            },
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame(
            {
                'STK_ID': [600809] * 3,
                'RPT_Date': [20120930, 20121231, 20130331],
                'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                'TClose': [38.05, 41.66, 30.01]
            },
            index=MultiIndex.from_tuples([(600809, 20120930),
                                          (600809, 20121231),
                                          (600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        # 'TClose_x' is renamed onto the label 'TClose'
        result = k.rename(columns={
            'TClose_x': 'TClose',
            'TClose_y': 'QT_Close'
        })
        # evaluated and discarded, as in check()
        str(result)
        result.dtypes

        expected = (DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809,
              u('饡驦'), 30.01]],
            columns=[
                'RT', 'TClose', 'TExg', 'RPT_Date', 'STK_ID', 'STK_Name',
                'QT_Close'
            ]).set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        # a list and a scalar label are both compared against the same result
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        # all three columns hold the same values, so one column's summary
        # is reused for each of them
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'],
                       dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        # the expected value is computed with unique labels, then relabelled
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Пример #6
0
    def test_column_dups_operations(self):
        """Exercise DataFrame operations on frames with duplicate column labels.

        Sections run sequentially and several share one mutable ``df``:
        column relabelling, insert / set / delete, consolidation, rename
        after a merge, reindex, drop, ``describe()``, assigning an index to
        a duplicated label, arithmetic dunder methods, and scalar
        assignment to a duplicated label.
        """

        def check(result, expected=None):
            """Compare ``result`` with ``expected`` (when given) and touch it.

            ``dtypes`` and ``str()`` are evaluated and discarded, so the
            only thing they can contribute is an exception.
            """
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        # the inserted value is one element shorter than the index
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        # restore the value the following sections expect
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        # NOTE(review): newer pandas replaces the public ``consolidate``
        # with ``_consolidate`` -- confirm against the targeted version.
        df = df.consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        # rejected without allow_duplicates, accepted with it
        assertRaisesRegexp(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        # selecting a duplicated label is compared against a two-column frame
        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                             'bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {'TClose': [22.02],
             'RT': [0.0454],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'STK_ID': [600809] * 3,
                         'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        # 'TClose_x' is renamed onto the label 'TClose'
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        # evaluated and discarded, as in check()
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        # a list and a scalar label are both compared against the same result
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        # all three columns hold the same values, so one column's summary
        # is reused for each of them
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        # the expected value is computed with unique labels, then relabelled
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Пример #7
0
 def test_multilevel_consolidate(self):
     index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                     ('bar', 'one'), ('bar', 'two')])
     df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
     df['Totals', ''] = df.sum(1)
     df = df.consolidate()
Пример #8
0
 def test_multilevel_consolidate(self):
     index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                     ('bar', 'one'), ('bar', 'two')])
     df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
     df['Totals', ''] = df.sum(1)
     df = df.consolidate()
Пример #9
0
 def test_multilevel_consolidate(self):
     index = MultiIndex.from_tuples([("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")])
     df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
     df["Totals", ""] = df.sum(1)
     df = df.consolidate()