Example #1
    def testRechunk(self):
        raw = pd.DataFrame(np.random.rand(10, 10))
        df = from_pandas_df(raw, chunk_size=3)
        df2 = df.rechunk(4).tiles()

        self.assertEqual(df2.shape, (10, 10))
        self.assertEqual(len(df2.chunks), 9)

        self.assertEqual(df2.chunks[0].shape, (4, 4))
        pd.testing.assert_index_equal(df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4))
        pd.testing.assert_index_equal(df2.chunks[0].columns_value.to_pandas(), pd.RangeIndex(4))
        pd.testing.assert_series_equal(df2.chunks[0].dtypes, raw.dtypes[:4])

        self.assertEqual(df2.chunks[2].shape, (4, 2))
        pd.testing.assert_index_equal(df2.chunks[2].index_value.to_pandas(), pd.RangeIndex(4))
        pd.testing.assert_index_equal(df2.chunks[2].columns_value.to_pandas(), pd.RangeIndex(8, 10))
        pd.testing.assert_series_equal(df2.chunks[2].dtypes, raw.dtypes[-2:])

        self.assertEqual(df2.chunks[-1].shape, (2, 2))
        pd.testing.assert_index_equal(df2.chunks[-1].index_value.to_pandas(), pd.RangeIndex(8, 10))
        pd.testing.assert_index_equal(df2.chunks[-1].columns_value.to_pandas(), pd.RangeIndex(8, 10))
        pd.testing.assert_series_equal(df2.chunks[-1].dtypes, raw.dtypes[-2:])

        for c in df2.chunks:
            self.assertEqual(c.shape[1], len(c.dtypes))
            self.assertEqual(len(c.columns_value.to_pandas()), len(c.dtypes))

        columns = [np.random.bytes(10) for _ in range(10)]
        index = np.random.randint(-100, 100, size=(4,))
        raw = pd.DataFrame(np.random.rand(4, 10), index=index, columns=columns)
        df = from_pandas_df(raw, chunk_size=3)
        df2 = df.rechunk(6).tiles()

        self.assertEqual(df2.shape, (4, 10))
        self.assertEqual(len(df2.chunks), 2)

        self.assertEqual(df2.chunks[0].shape, (4, 6))
        pd.testing.assert_index_equal(df2.chunks[0].index_value.to_pandas(), df.index_value.to_pandas())
        pd.testing.assert_index_equal(df2.chunks[0].columns_value.to_pandas(), pd.Index(columns[:6]))
        pd.testing.assert_series_equal(df2.chunks[0].dtypes, raw.dtypes[:6])

        self.assertEqual(df2.chunks[1].shape, (4, 4))
        pd.testing.assert_index_equal(df2.chunks[1].index_value.to_pandas(), df.index_value.to_pandas())
        pd.testing.assert_index_equal(df2.chunks[1].columns_value.to_pandas(), pd.Index(columns[6:]))
        pd.testing.assert_series_equal(df2.chunks[1].dtypes, raw.dtypes[-4:])

        for c in df2.chunks:
            self.assertEqual(c.shape[1], len(c.dtypes))
            self.assertEqual(len(c.columns_value.to_pandas()), len(c.dtypes))

        # test Series rechunk
        series = from_pandas_series(pd.Series(np.random.rand(10,)), chunk_size=3)
        series2 = series.rechunk(4).tiles()

        self.assertEqual(series2.shape, (10,))
        self.assertEqual(len(series2.chunks), 3)
        pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10))

        self.assertEqual(series2.chunk_shape, (3,))
        self.assertEqual(series2.nsplits, ((4, 4, 2), ))
        self.assertEqual(series2.chunks[0].shape, (4,))
        pd.testing.assert_index_equal(series2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4))
        self.assertEqual(series2.chunks[1].shape, (4,))
        pd.testing.assert_index_equal(series2.chunks[1].index_value.to_pandas(), pd.RangeIndex(4, 8))
        self.assertEqual(series2.chunks[2].shape, (2,))
        pd.testing.assert_index_equal(series2.chunks[2].index_value.to_pandas(), pd.RangeIndex(8, 10))

        series2 = series.rechunk(1).tiles()

        self.assertEqual(series2.shape, (10,))
        self.assertEqual(len(series2.chunks), 10)
        pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10))

        self.assertEqual(series2.chunk_shape, (10,))
        self.assertEqual(series2.nsplits, ((1,) * 10, ))
        self.assertEqual(series2.chunks[0].shape, (1,))
        pd.testing.assert_index_equal(series2.chunks[0].index_value.to_pandas(), pd.RangeIndex(1))

        # no need to rechunk
        series2 = series.rechunk(3).tiles()
        series = get_tiled(series)
        self.assertEqual(series2.chunk_shape, series.chunk_shape)
        self.assertEqual(series2.nsplits, series.nsplits)

        # test rechunk on DataFrame has known shape, but chunk's shape is unknown
        data = pd.DataFrame({0: [1, 2], 1: [3, 4], 'a': [5, 6]})
        df = from_pandas_df(data)
        df = df[df[0] < 3]
        with self.assertRaises(TilesError):
            df.tiles().rechunk((np.nan, 3)).tiles()
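# Side note (a minimal plain-pandas sketch, not part of the Mars test above):
# the (4, 4, 2) split pattern asserted above is just what you get when a
# length-10 axis is cut into pieces of 4, giving 3 x 3 = 9 chunks.
import numpy as np
import pandas as pd

raw = pd.DataFrame(np.random.rand(10, 10))
bounds = list(range(0, 10, 4)) + [10]  # [0, 4, 8, 10]
chunks = [raw.iloc[r0:r1, c0:c1]
          for r0, r1 in zip(bounds, bounds[1:])
          for c0, c1 in zip(bounds, bounds[1:])]
assert len(chunks) == 9
assert chunks[0].shape == (4, 4)
assert chunks[-1].shape == (2, 2)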
Example #2
    def testConcat(self):
        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        mdf1 = from_pandas(df1, chunk_size=4)
        mdf2 = from_pandas(df2, chunk_size=4)
        r = concat([mdf1, mdf2], axis='index')

        self.assertEqual(r.shape, (20, 4))
        pd.testing.assert_series_equal(r.dtypes, df1.dtypes)

        tiled = r.tiles()
        self.assertEqual(tiled.nsplits, ((4, 4, 2, 4, 4, 2), (4,)))
        for i, c in enumerate(tiled.chunks):
            self.assertEqual(c.index, (i, 0))

        df3 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                           index=pd.RangeIndex(10, 20))

        mdf3 = from_pandas(df3, chunk_size=4)
        r = concat([mdf1, mdf3], axis='index')

        self.assertEqual(r.shape, (20, 4))
        pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
        pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))

        df4 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                           index=np.random.permutation(np.arange(10)))

        mdf4 = from_pandas(df4, chunk_size=4)
        r = concat([mdf1, mdf4], axis='index')

        self.assertEqual(r.shape, (20, 4))
        pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
        pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.Index([], dtype=np.int64))

        r = concat([mdf4, mdf1], axis='index')

        self.assertEqual(r.shape, (20, 4))
        pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
        pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.Index([], dtype=np.int64))

        r = concat([mdf4, mdf4], axis='index')

        self.assertEqual(r.shape, (20, 4))
        pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
        pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.Index([], dtype=np.int64))

        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=4)
        r = concat([mdf1, mdf2], axis='columns')

        self.assertEqual(r.shape, (10, 8))
        expected_dtypes = pd.concat([df1, df2], axis='columns').dtypes
        pd.testing.assert_series_equal(r.dtypes, expected_dtypes)

        tiled = r.tiles()
        self.assertEqual(tiled.nsplits, ((3, 3, 3, 1), (3, 1, 4)))
        for i, c in enumerate(tiled.chunks):
            index = (i // 3, i % 3)
            self.assertEqual(c.index, index)

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)
        r = concat([mdf1, mdf2], join='inner')

        self.assertEqual(r.shape, (20, 3))
        tiled = r.tiles()
        self.assertEqual(tiled.nsplits, ((3, 3, 3, 1, 3, 3, 3, 1), (3, )))
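# Side note (an illustrative plain-pandas sketch, not part of the test above):
# concatenating a frame indexed by RangeIndex(10) with one indexed by
# RangeIndex(10, 20) yields a contiguous RangeIndex(20), which is the index
# behavior the mdf1/mdf3 case asserts.
import numpy as np
import pandas as pd

a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                 index=pd.RangeIndex(10, 20))
out = pd.concat([a, b], axis='index')
pd.testing.assert_index_equal(out.index, pd.RangeIndex(20))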
Example #3
class TestSeriesConstructors(TestData):
    def test_invalid_dtype(self):
        # GH15520
        msg = 'not understood'
        invalid_list = [pd.Timestamp, 'pd.Timestamp', list]
        for dtype in invalid_list:
            with tm.assert_raises_regex(TypeError, msg):
                Series([], name='time', dtype=dtype)

    def test_scalar_conversion(self):

        # Passing in a scalar is disabled
        scalar = Series(0.5)
        assert not isinstance(scalar, float)

        # Coercion
        assert float(Series([1.])) == 1.0
        assert int(Series([1.])) == 1
        assert long(Series([1.])) == 1  # 'long' comes from pandas.compat (Python 2)

    def test_constructor(self):
        assert self.ts.index.is_all_dates

        # Pass in Series
        derived = Series(self.ts)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, self.ts.index)
        # Ensure new index is not created
        assert id(self.ts.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not self.empty.index.is_all_dates
        assert not Series({}).index.is_all_dates
        pytest.raises(Exception,
                      Series,
                      np.random.randn(3, 3),
                      index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        pytest.raises(NotImplementedError, Series, m)

    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        empty = Series()
        empty2 = Series(input_class())

        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        assert_series_equal(empty, empty2, check_index_type=False)

        # With explicit dtype:
        empty = Series(dtype='float64')
        empty2 = Series(input_class(), dtype='float64')
        assert_series_equal(empty, empty2, check_index_type=False)

        # GH 18515 : with dtype=category:
        empty = Series(dtype='category')
        empty2 = Series(input_class(), dtype='category')
        assert_series_equal(empty, empty2, check_index_type=False)

        if input_class is not list:
            # With index:
            empty = Series(index=lrange(10))
            empty2 = Series(input_class(), index=lrange(10))
            assert_series_equal(empty, empty2)

            # With index and dtype float64:
            empty = Series(np.nan, index=lrange(10))
            empty2 = Series(input_class(), index=lrange(10), dtype='float64')
            assert_series_equal(empty, empty2)

            # GH 19853 : with empty string, index and dtype str
            empty = Series('', dtype=str, index=range(3))
            empty2 = Series('', index=range(3))
            assert_series_equal(empty, empty2)

    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
    def test_constructor_nan(self, input_arg):
        empty = Series(dtype='float64', index=lrange(10))
        empty2 = Series(input_arg, index=lrange(10))

        assert_series_equal(empty, empty2, check_index_type=False)

    @pytest.mark.parametrize('dtype', [
        'f8',
        'i8',
        'M8[ns]',
        'm8[ns]',
        'category',
        'object',
        'datetime64[ns, UTC]',
    ])
    @pytest.mark.parametrize('index', [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        # GH-20865
        result = pd.Series(dtype=dtype, index=index)
        assert result.dtype == dtype
        assert len(result) == 0

    def test_constructor_no_data_index_order(self):
        result = pd.Series(index=['b', 'a', 'c'])
        assert result.index.tolist() == ['b', 'a', 'c']

    def test_constructor_series(self):
        index1 = ['d', 'b', 'a', 'c']
        index2 = sorted(index1)
        s1 = Series([4, 7, -5, 3], index=index1)
        s2 = Series(s1, index=index2)

        assert_series_equal(s2, s1.sort_index())

    def test_constructor_iterator(self):

        expected = Series(list(range(10)), dtype='int64')
        result = Series(range(10), dtype='int64')
        assert_series_equal(result, expected)

    def test_constructor_list_like(self):

        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype='int64')
        for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype='int64')]:
            result = Series(obj, index=[0, 1, 2])
            assert_series_equal(result, expected)

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        ([1.0, 2.0, np.nan]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals):
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'

        for dtype in ['str', str, 'U']:
            result = Series(input_vals, dtype=dtype)
            expected = Series(input_vals).astype(dtype)
            assert_series_equal(result, expected)

    def test_constructor_generator(self):
        gen = (i for i in range(10))

        result = Series(gen)
        exp = Series(lrange(10))
        assert_series_equal(result, exp)

        gen = (i for i in range(10))
        result = Series(gen, index=lrange(10, 20))
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_map(self):
        # GH8909
        m = map(lambda x: x, range(10))

        result = Series(m)
        exp = Series(lrange(10))
        assert_series_equal(result, exp)

        m = map(lambda x: x, range(10))
        result = Series(m, index=lrange(10, 20))
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_categorical(self):
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                             fastpath=True)
        res = Series(cat)
        tm.assert_categorical_equal(res.values, cat)

        # GH12574
        pytest.raises(ValueError,
                      lambda: Series(pd.Categorical([1, 2, 3]), dtype='int64'))
        cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
        assert is_categorical_dtype(cat)
        assert is_categorical_dtype(cat.dtype)
        s = Series([1, 2, 3], dtype='category')
        assert is_categorical_dtype(s)
        assert is_categorical_dtype(s.dtype)

    def test_constructor_categorical_with_coercion(self):
        factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        # test basic creation / coercion of categoricals
        s = Series(factor, name='A')
        assert s.dtype == 'category'
        assert len(s) == len(factor)
        str(s.values)
        str(s)

        # in a frame
        df = DataFrame({'A': factor})
        result = df['A']
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        df = DataFrame({'A': s})
        result = df['A']
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # multiples
        df = DataFrame({'A': s, 'B': s, 'C': 1})
        result1 = df['A']
        result2 = df['B']
        tm.assert_series_equal(result1, s)
        tm.assert_series_equal(result2, s, check_names=False)
        assert result2.name == 'B'
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # GH8623
        x = DataFrame(
            [[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
            columns=['person_id', 'person_name'])
        x['person_name'] = Categorical(
            x.person_name)  # doing this breaks transform

        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        result = pd.Series(['a', 'b'],
                           dtype=CategoricalDtype(['a', 'b', 'c'],
                                                  ordered=True))
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
        assert result.cat.ordered

        result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        result = Series('a',
                        index=[0, 1],
                        dtype=CategoricalDtype(['a', 'b'], ordered=True))
        expected = Series(['a', 'a'],
                          index=[0, 1],
                          dtype=CategoricalDtype(['a', 'b'], ordered=True))
        tm.assert_series_equal(result, expected, check_categorical=True)

    def test_categorical_sideeffects_free(self):
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        assert s.cat is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        left = pd.Series(['a', 'b', 'c'], dtype=CategoricalDtype(['a', 'b']))
        right = pd.Series(
            pd.Categorical(['a', 'b', np.nan], categories=['a', 'b']))
        tm.assert_series_equal(left, right)

    def test_constructor_maskedarray(self):
        data = ma.masked_all((3, ), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        data[0] = 0.0
        data[2] = 2.0
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        data = ma.masked_all((3, ), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        data = ma.masked_all((3, ), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        data = ma.masked_all((3, ), dtype='M8[ns]')
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), iNaT,
                           datetime(2001, 1, 3)],
                          index=index,
                          dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series(
            [datetime(2001, 1, 1),
             datetime(2001, 1, 2),
             datetime(2001, 1, 3)],
            index=index,
            dtype='M8[ns]')
        assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        rng = date_range('20090415', '20090519', freq='B')
        data = {k: 1 for k in rng}

        result = Series(data, index=rng)
        assert result.index is rng

    def test_constructor_default_index(self):
        s = Series([0, 1, 2])
        tm.assert_index_equal(s.index, pd.Index(np.arange(3)))

    @pytest.mark.parametrize('input', [[1, 2, 3], (1, 2, 3),
                                       list(range(3)),
                                       pd.Categorical(['a', 'b', 'a']),
                                       (i for i in range(3)),
                                       map(lambda x: x, range(3))])
    def test_constructor_index_mismatch(self, input):
        # GH 19342
        # test that construction of a Series with an index of different length
        # raises an error
        msg = 'Length of passed values is 3, index implies 4'
        with pytest.raises(ValueError, message=msg):
            Series(input, index=np.arange(4))

    def test_constructor_numpy_scalar(self):
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        result = Series(np.array(100), index=np.arange(4), dtype='int64')
        expected = Series(100, index=np.arange(4), dtype='int64')
        tm.assert_series_equal(result, expected)

    def test_constructor_broadcast_list(self):
        # GH 19342
        # construction with single-element container and index
        # should raise
        pytest.raises(ValueError, Series, ['foo'], index=['a', 'b', 'c'])

    def test_constructor_corner(self):
        df = tm.makeTimeDataFrame()
        objs = [df, df]
        s = Series(objs, index=[0, 1])
        assert isinstance(s, Series)

    def test_constructor_sanitize(self):
        s = Series(np.array([1., 1., 8.]), dtype='i8')
        assert s.dtype == np.dtype('i8')

        s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8')
        assert s.dtype == np.dtype('f8')

    def test_constructor_copy(self):
        # GH15125
        # test dtype parameter has no side effects on copy=True
        for data in [[1.], np.array([1.])]:
            x = Series(data)
            y = pd.Series(x, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(x, y)

            # changes to origin of copy does not affect the copy
            x[0] = 2.
            assert not x.equals(y)
            assert x[0] == 2.
            assert y[0] == 1.

    @pytest.mark.parametrize("index", [
        pd.date_range('20170101', periods=3, tz='US/Eastern'),
        pd.date_range('20170101', periods=3),
        pd.timedelta_range('1 day', periods=3),
        pd.period_range('2012Q1', periods=3, freq='Q'),
        pd.Index(list('abc')),
        pd.Int64Index([1, 2, 3]),
        pd.RangeIndex(0, 3)
    ],
                             ids=lambda x: type(x).__name__)
    def test_constructor_limit_copies(self, index):
        # GH 17449
        # limit copies of input
        s = pd.Series(index)

        # we make 1 copy; this is just a smoke test here
        assert s._data.blocks[0].values is not index

    def test_constructor_pass_none(self):
        s = Series(None, index=lrange(5))
        assert s.dtype == np.float64

        s = Series(None, index=lrange(5), dtype=object)
        assert s.dtype == np.object_

        # GH 7431
        # inference on the index
        s = Series(index=np.array([None]))
        expected = Series(index=Index([None]))
        assert_series_equal(s, expected)

    def test_constructor_pass_nan_nat(self):
        # GH 13467
        exp = Series([np.nan, np.nan], dtype=np.float64)
        assert exp.dtype == np.float64
        tm.assert_series_equal(Series([np.nan, np.nan]), exp)
        tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp)

        exp = Series([pd.NaT, pd.NaT])
        assert exp.dtype == 'datetime64[ns]'
        tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp)
        tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp)

        tm.assert_series_equal(Series([pd.NaT, np.nan]), exp)
        tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp)

        tm.assert_series_equal(Series([np.nan, pd.NaT]), exp)
        tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)

    def test_constructor_cast(self):
        pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float)

    def test_constructor_dtype_nocast(self):
        # 1572
        s = Series([1, 2, 3])

        s2 = Series(s, dtype=np.int64)

        s2[1] = 5
        assert s[1] == 5

    def test_constructor_datelike_coercion(self):

        # GH 9477
        # incorrectly inferring datetimelike-looking values when object dtype is
        # specified
        s = Series([Timestamp('20130101'), 'NOV'], dtype=object)
        assert s.iloc[0] == Timestamp('20130101')
        assert s.iloc[1] == 'NOV'
        assert s.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even though the blocks are mixed
        belly = '216 3T19'.split()
        wing1 = '2T15 4H19'.split()
        wing2 = '416 4T20'.split()
        mat = pd.to_datetime('2016-01-22 2019-09-07'.split())
        df = pd.DataFrame({
            'wing1': wing1,
            'wing2': wing2,
            'mat': mat
        },
                          index=belly)

        result = df.loc['3T19']
        assert result.dtype == object
        result = df.loc['216']
        assert result.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        # gh-15869
        for arr in [
                np.array([None, None, None, None,
                          datetime.now(), None]),
                np.array([None, None, datetime.now(), None])
        ]:
            result = Series(arr)
            assert result.dtype == 'M8[ns]'

    def test_constructor_dtype_datetime64(self):

        s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        # in theory this should be all nulls, but since
        # we are not specifying a dtype it is ambiguous
        s = Series(iNaT, index=lrange(5))
        assert not isna(s).all()

        s = Series(nan, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        # GH3416
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]

        s = Series(dates)
        assert s.dtype == 'M8[ns]'

        s.iloc[0] = np.nan
        assert s.dtype == 'M8[ns]'

        # GH3414 related
        pytest.raises(
            TypeError, lambda: Series(Series(dates).astype('int') / 1000000,
                                      dtype='M8[ms]'))
        pytest.raises(TypeError, lambda: Series(dates, dtype='datetime64'))

        # invalid dates can be held as object
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp('20130101'), 1], index=['a', 'b'])
        assert result['a'] == Timestamp('20130101')
        assert result['b'] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
        values2 = dates.view(np.ndarray).astype('datetime64[ns]')
        expected = Series(values2, index=dates)

        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, dates)
            assert_series_equal(result, expected)

        # GH 13876
        # coerce to non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, index=dates, dtype=object)
            assert_series_equal(result, expected)

        # leave datetime.date alone
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range('20130101', periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range('20130101', periods=3, tz='UTC')
        assert str(Series(dr).iloc[0].tz) == 'UTC'
        dr = date_range('20130101', periods=3, tz='US/Eastern')
        assert str(Series(dr).iloc[0].tz) == 'US/Eastern'

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == 'object'
        assert s[2] is np.nan
        assert 'NaN' in str(s)

    def test_constructor_with_datetime_tz(self):

        # 8260
        # support datetime64 with tz

        dr = date_range('20130101', periods=3, tz='US/Eastern')
        s = Series(dr)
        assert s.dtype.name == 'datetime64[ns, US/Eastern]'
        assert s.dtype == 'datetime64[ns, US/Eastern]'
        assert is_datetime64tz_dtype(s.dtype)
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # export
        result = s.values
        assert isinstance(result, np.ndarray)
        assert result.dtype == 'datetime64[ns]'

        exp = pd.DatetimeIndex(result)
        exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
        tm.assert_index_equal(dr, exp)

        # indexing
        result = s.iloc[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern',
                                   freq='D')
        result = s[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern',
                                   freq='D')

        result = s[Series([True, True, False], index=s.index)]
        assert_series_equal(result, s[0:2])

        result = s.iloc[0:1]
        assert_series_equal(result, Series(dr[0:1]))

        # concat
        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
        assert_series_equal(result, s)

        # short str
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # formatting with NaT
        result = s.shift()
        assert 'datetime64[ns, US/Eastern]' in str(result)
        assert 'NaT' in str(result)

        # long str
        t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
        assert 'datetime64[ns, US/Eastern]' in str(t)

        result = pd.DatetimeIndex(s, freq='infer')
        tm.assert_index_equal(result, dr)

        # inference
        s = Series([
            pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
            pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
        ])
        assert s.dtype == 'datetime64[ns, US/Pacific]'
        assert lib.infer_dtype(s) == 'datetime64'

        s = Series([
            pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
            pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')
        ])
        assert s.dtype == 'object'
        assert lib.infer_dtype(s) == 'datetime'

        # with all NaT
        s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
        expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
        assert_series_equal(s, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units
        # gh-19223
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([1, 2, 3], dtype=arr_dtype)
        s = Series(arr)
        result = s.astype(dtype)
        expected = Series(arr.astype(dtype))

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize('arg',
                             ['2013-01-01 00:00:00', pd.NaT, np.nan, None])
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        # GH 17415: With naive string
        result = Series([arg], dtype='datetime64[ns, CET]')
        expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET')
        assert_series_equal(result, expected)

    def test_construction_interval(self):
        # construction from interval & array of intervals
        index = IntervalIndex.from_breaks(np.arange(3), closed='right')
        result = Series(index)
        repr(result)
        str(result)
        tm.assert_index_equal(Index(result.values), index)

        result = Series(index.values)
        tm.assert_index_equal(Index(result.values), index)

    def test_construction_consistency(self):

        # make sure that we are not re-localizing upon construction
        # GH 14928
        s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))

        result = Series(s, dtype=s.dtype)
        tm.assert_series_equal(result, s)

        result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype)
        tm.assert_series_equal(result, s)

        result = Series(s.values, dtype=s.dtype)
        tm.assert_series_equal(result, s)

    def test_constructor_periodindex(self):
        # GH7932
        # converting a PeriodIndex when put in a Series

        pi = period_range('20130101', periods=5, freq='D')
        s = Series(pi)
        expected = Series(pi.astype(object))
        assert_series_equal(s, expected)

        assert s.dtype == 'object'

    def test_constructor_dict(self):
        d = {'a': 0., 'b': 1., 'c': 2.}
        result = Series(d, index=['b', 'c', 'd', 'a'])
        expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a'])
        assert_series_equal(result, expected)

        pidx = tm.makePeriodIndex(100)
        d = {pidx[0]: 0, pidx[1]: 1}
        result = Series(d, index=pidx)
        expected = Series(np.nan, pidx)
        expected.iloc[0] = 0
        expected.iloc[1] = 1
        assert_series_equal(result, expected)

    def test_constructor_dict_order(self):
        # GH19018
        # initialization ordering: by insertion order if python>= 3.6, else
        # order by value
        d = {'b': 1, 'a': 0, 'c': 2}
        result = Series(d)
        if PY36:
            expected = Series([1, 0, 2], index=list('bac'))
        else:
            expected = Series([0, 1, 2], index=list('abc'))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
    def test_constructor_dict_nan_key(self, value):
        # GH 18480
        d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
        result = Series(d).sort_values()
        expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
        assert_series_equal(result, expected)

        # MultiIndex:
        d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
        result = Series(d).sort_values()
        expected = Series(['a', 'b', 'c'],
                          index=Index([(1, 1), (2, np.nan), (3, value)]))
        assert_series_equal(result, expected)

    def test_constructor_dict_datetime64_index(self):
        # GH 9456

        dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        def create_data(constructor):
            return dict(zip((constructor(x) for x in dates_as_str), values))

        data_datetime64 = create_data(np.datetime64)
        data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d'))
        data_Timestamp = create_data(Timestamp)

        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        result_datetime64 = Series(data_datetime64)
        result_datetime = Series(data_datetime)
        result_Timestamp = Series(data_Timestamp)

        assert_series_equal(result_datetime64, expected)
        assert_series_equal(result_datetime, expected)
        assert_series_equal(result_Timestamp, expected)

    def test_constructor_list_of_tuples(self):
        data = [(1, 1), (2, 2), (2, 3)]
        s = Series(data)
        assert list(s) == data

    def test_constructor_tuple_of_tuples(self):
        data = ((1, 1), (2, 2), (2, 3))
        s = Series(data)
        assert tuple(s) == data

    def test_constructor_dict_of_tuples(self):
        data = {(1, 2): 3, (None, 5): 6}
        result = Series(data).sort_values()
        expected = Series([3, 6],
                          index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
        tm.assert_series_equal(result, expected)

    def test_constructor_set(self):
        values = set([1, 2, 3, 4, 5])
        pytest.raises(TypeError, Series, values)
        values = frozenset(values)
        pytest.raises(TypeError, Series, values)

    def test_fromDict(self):
        data = {'a': 0, 'b': 1, 'c': 2, 'd': 3}

        series = Series(data)
        assert tm.is_sorted(series.index)

        data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()}
        series = Series(data)
        assert series.dtype == np.object_

        data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'}
        series = Series(data)
        assert series.dtype == np.object_

        data = {'a': '0', 'b': '1'}
        series = Series(data, dtype=float)
        assert series.dtype == np.float64

    def test_fromValue(self):

        nans = Series(np.NaN, index=self.ts.index)
        assert nans.dtype == np.float_
        assert len(nans) == len(self.ts)

        strings = Series('foo', index=self.ts.index)
        assert strings.dtype == np.object_
        assert len(strings) == len(self.ts)

        d = datetime.now()
        dates = Series(d, index=self.ts.index)
        assert dates.dtype == 'M8[ns]'
        assert len(dates) == len(self.ts)

        # GH12336
        # Test construction of categorical series from value
        categorical = Series(0, index=self.ts.index, dtype="category")
        expected = Series(0, index=self.ts.index).astype("category")
        assert categorical.dtype == 'category'
        assert len(categorical) == len(self.ts)
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):

        # basic
        td = Series([timedelta(days=i) for i in range(3)])
        assert td.dtype == 'timedelta64[ns]'

        td = Series([timedelta(days=1)])
        assert td.dtype == 'timedelta64[ns]'

        td = Series(
            [timedelta(days=1),
             timedelta(days=2),
             np.timedelta64(1, 's')])

        assert td.dtype == 'timedelta64[ns]'

        # mixed with NaT
        td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
        assert td.dtype == 'timedelta64[ns]'

        td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
        assert td.dtype == 'timedelta64[ns]'

        td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
        assert td.dtype == 'timedelta64[ns]'

        # improved inference
        # GH5689
        td = Series([np.timedelta64(300000000), NaT])
        assert td.dtype == 'timedelta64[ns]'

        # because iNaT is int, not coerced to timedelta
        td = Series([np.timedelta64(300000000), iNaT])
        assert td.dtype == 'object'

        td = Series([np.timedelta64(300000000), np.nan])
        assert td.dtype == 'timedelta64[ns]'

        td = Series([pd.NaT, np.timedelta64(300000000)])
        assert td.dtype == 'timedelta64[ns]'

        td = Series([np.timedelta64(1, 's')])
        assert td.dtype == 'timedelta64[ns]'

        # these are frequency conversion astypes
        # for t in ['s', 'D', 'us', 'ms']:
        #    pytest.raises(TypeError, td.astype, 'm8[%s]' % t)

        # valid astype
        td.astype('int64')

        # invalid casting
        pytest.raises(TypeError, td.astype, 'int32')

        # this is an invalid casting
        def f():
            Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

        pytest.raises(Exception, f)

        # leave as object here
        td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
        assert td.dtype == 'object'

        # these will correctly infer a timedelta
        s = Series([None, pd.NaT, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'
        s = Series([np.nan, pd.NaT, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'
        s = Series([pd.NaT, None, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'
        s = Series([pd.NaT, np.nan, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'

    # GH 16406
    def test_constructor_mixed_tz(self):
        s = Series(
            [Timestamp('20130101'),
             Timestamp('20130101', tz='US/Eastern')])
        expected = Series(
            [Timestamp('20130101'),
             Timestamp('20130101', tz='US/Eastern')],
            dtype='object')
        assert_series_equal(s, expected)

    def test_NaT_scalar(self):
        series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

        val = series[3]
        assert isna(val)

        series[2] = val
        assert isna(series[2])

    def test_NaT_cast(self):
        # GH10747
        result = Series([np.nan]).astype('M8[ns]')
        expected = Series([NaT])
        assert_series_equal(result, expected)

    def test_constructor_name_hashable(self):
        for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), u"\u05D0"]:
            for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]:
                s = Series(data, name=n)
                assert s.name == n

    def test_constructor_name_unhashable(self):
        for n in [['name_list'], np.ones(2), {1: 2}]:
            for data in [['name_list'], np.ones(2), {1: 2}]:
                pytest.raises(TypeError, Series, data, name=n)

    def test_auto_conversion(self):
        series = Series(list(date_range('1/1/2000', periods=10)))
        assert series.dtype == 'M8[ns]'

    def test_convert_non_ns(self):
        # convert from a numpy array of non-ns timedelta64
        arr = np.array([1, 2, 3], dtype='timedelta64[s]')
        s = Series(arr)
        expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
        assert_series_equal(s, expected)

        # convert from a numpy array of non-ns datetime64
        # note that creating a numpy datetime64 is in LOCAL time!!!!
        # seems to work for M8[D], but not for M8[s]

        s = Series(
            np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
                     dtype='datetime64[D]'))
        assert_series_equal(
            s, Series(date_range('20130101', periods=3, freq='D')))

        # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
        # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

        # assert_series_equal(s,date_range('20130101
        # 00:00:01',period=3,freq='s'))

    @pytest.mark.parametrize("index", [
        date_range('1/1/2000', periods=10),
        timedelta_range('1 day', periods=10),
        period_range('2000-Q1', periods=10, freq='Q')
    ],
                             ids=lambda x: type(x).__name__)
    def test_constructor_cant_cast_datetimelike(self, index):

        # floats are not ok
        msg = "Cannot cast {} to ".format(type(index).__name__)
        with tm.assert_raises_regex(TypeError, msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("index", [
        date_range('1/1/2000', periods=10),
        timedelta_range('1 day', periods=10),
        period_range('2000-Q1', periods=10, freq='Q')
    ],
                             ids=lambda x: type(x).__name__)
    def test_constructor_cast_object(self, index):
        s = Series(index, dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

        s = Series(pd.Index(index, dtype=object), dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

        s = Series(index.astype(object), dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

    def test_constructor_generic_timestamp_deprecated(self):
        # see gh-15524

        with tm.assert_produces_warning(FutureWarning):
            dtype = np.timedelta64
            s = Series([], dtype=dtype)

            assert s.empty
            assert s.dtype == 'm8[ns]'

        with tm.assert_produces_warning(FutureWarning):
            dtype = np.datetime64
            s = Series([], dtype=dtype)

            assert s.empty
            assert s.dtype == 'M8[ns]'

        # These timestamps have the wrong frequencies,
        # so an Exception should be raised now.
        msg = "cannot convert timedeltalike"
        with tm.assert_raises_regex(TypeError, msg):
            Series([], dtype='m8[ps]')

        msg = "cannot convert datetimelike"
        with tm.assert_raises_regex(TypeError, msg):
            Series([], dtype='M8[ps]')

    @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
    def test_constructor_range_dtype(self, dtype):
        # GH 16804
        expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64')
        result = Series(range(5), dtype=dtype)
        tm.assert_series_equal(result, expected)
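# Side note (an illustration, not part of the test class above): a Series built
# without an explicit index gets a RangeIndex by default, which is what
# test_constructor_default_index checks against pd.Index(np.arange(3)).
import pandas as pd

s = pd.Series([0, 1, 2])
assert isinstance(s.index, pd.RangeIndex)
assert s.index.equals(pd.RangeIndex(3))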
Example #4
 def test_unique_index(self):
     cases = [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)]
     for case in cases:
         self.assertTrue(case.is_unique)
         tm.assert_numpy_array_equal(case.duplicated(),
                                     np.array([False, False, False]))
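# Side note (an illustration, not from the test above): the complementary case,
# where a repeated label makes is_unique False and duplicated() flags the
# later occurrence.
import numpy as np
import pandas as pd

idx = pd.Index([1, 1, 2])
assert not idx.is_unique
np.testing.assert_array_equal(idx.duplicated(),
                              np.array([False, True, False]))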
Example #5
def SWBDAnalysis(swbdCorpus,
                 speakerData,
                 labelWanted=None,
                 numberOfWords=5,
                 clusterKMean=0):
    """

    :param swbdCorpus:
    :param speakerData:
    :param labelWanted:
    :param numberOfWords:
    :param clusterKMean: if 0 we will use label
    :return:
    """
    dataframe = None

    if not os.path.isfile(path.join(getPathToSerialized(), "swbdDataframe")):

        frequencyAnalysis = freqAnalysis(swbdCorpus,
                                         numberOfWords=numberOfWords,
                                         printMostCommon=True)

        dimensionAnalysis = analysisInManyDimensions([swbdCorpus])
        dataframe = pd.concat([frequencyAnalysis, dimensionAnalysis],
                              axis=1,
                              sort=False)

        with open(path.join(getPathToSerialized(), "swbdDataframe"), "wb") as f:
            pickle.dump(dataframe, f)
    else:
        with open(path.join(getPathToSerialized(), "swbdDataframe"), "rb") as f:
            dataframe = pickle.load(f)

    # grouping files by speaker
    eachFilespeakerID = swbdCorpus.getSpeakerByFile()
    dataframe["idSpeaker"] = eachFilespeakerID
    dataframe = dataframe.groupby(['idSpeaker']).mean()

    if clusterKMean == 0 and labelWanted is not None:
        # we cluster by labelWanted
        filteredSpeaker = {}
        for speaker in speakerData:
            for info in speakerData[speaker]:
                if info == labelWanted:
                    filteredSpeaker[speaker] = speakerData[speaker][info]

        dataframe["label"] = pd.Series(filteredSpeaker)
        # dataframe.index = pd.RangeIndex(len(dataframe.index)) does not work
        dataframe.index = np.arange(len(dataframe))

    elif clusterKMean > 0:

        dataframe.index = pd.RangeIndex(len(
            dataframe.index))  # restarting the indexes is important
        # because the current indices are the speakers' identification numbers

        kmeanModel = KMeans(n_clusters=clusterKMean).fit(dataframe)
        clusters = kmeanModel.predict(
            dataframe)  # associate a cluster to each speaker
        dataframe["label"] = pd.Series(dict(
            enumerate(clusters)))  # going from a list to a dict and assigning
    else:
        print(
            "need either label wanted or clusterKMean in function SWBDAnalysisSpeakers"
        )

    with open(path.join(getPathToSerialized(), "swbdDataframeBySpeaker"), "wb") as f:
        pickle.dump(dataframe, f)

    return dataframe, eachFilespeakerID
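# Side note (an illustration, not part of SWBDAnalysis): the idiomatic way to
# discard a meaningful index, equivalent to the np.arange assignment above, is
# reset_index(drop=True), which installs a fresh RangeIndex.
import pandas as pd

demo = pd.DataFrame({'v': [1, 2, 3]}, index=[101, 202, 303])
demo = demo.reset_index(drop=True)
assert demo.index.equals(pd.RangeIndex(3))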
Example #6
 def to_pandas(self):
     return pd.RangeIndex(start=self._start,
                          stop=self._stop,
                          dtype=self.dtype)
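# Usage sketch (assumes self._start and self._stop are integer bounds): the
# keyword form used above is equivalent to the positional RangeIndex form.
import pandas as pd

assert pd.RangeIndex(start=0, stop=5).equals(pd.RangeIndex(5))
assert list(pd.RangeIndex(start=2, stop=6)) == [2, 3, 4, 5]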
Example #7
File: io.py Project: robindar/wot
def read_anndata(path, backed=None):
    path = str(path)
    tmp_path = None
    if path.startswith('gs://'):
        tmp_path = download_gs_url(path)
        path = tmp_path
    basename_and_extension = get_filename_and_extension(path)
    ext = basename_and_extension[1]
    if ext == 'mtx':
        x = scipy.io.mmread(path)
        x = scipy.sparse.csr_matrix(x.T)
        # look for .barcodes.txt and .genes.txt
        import itertools
        sp = os.path.split(path)
        obs = None

        for sep_ext in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
            for prefix in ['', basename_and_extension[0] + sep_ext[0]]:
                f = os.path.join(sp[0], prefix + 'barcodes.' + sep_ext[1])
                if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                    obs = pd.read_csv(f if os.path.isfile(f) else f + '.gz',
                                      index_col=0,
                                      sep='\t',
                                      header=None)
                    break
        var = None
        for sep_ext in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
            for prefix in ['', basename_and_extension[0] + sep_ext[0]]:
                f = os.path.join(sp[0], prefix + 'genes.' + sep_ext[1])
                if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                    var = pd.read_csv(f if os.path.isfile(f) else f + '.gz',
                                      index_col=0,
                                      sep='\t',
                                      header=None)
                    break

        if var is None:
            print(basename_and_extension[0] + '.genes.txt not found')
            var = pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[1], step=1))
        if obs is None:
            print(basename_and_extension[0] + '.barcodes.txt not found')
            obs = pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[0], step=1))

        cell_count, gene_count = x.shape
        if len(obs) != cell_count:
            raise ValueError("Wrong number of cells : matrix has {} cells, barcodes file has {}" \
                             .format(cell_count, len(obs)))
        if len(var) != gene_count:
            raise ValueError("Wrong number of genes : matrix has {} genes, genes file has {}" \
                             .format(gene_count, len(var)))

        return anndata.AnnData(X=x, obs=obs, var=var)
    elif ext == 'h5':
        return sc.read_10x_h5(path, genome=None, gex_only=True)
    elif ext == 'npz':
        obj = np.load(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(X=obj['x'],
                               obs=pd.DataFrame(index=obj['rid']),
                               var=pd.DataFrame(index=obj['cid']))
    elif ext == 'npy':
        x = np.load(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(
            X=x,
            obs=pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[0], step=1)),
            var=pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[1], step=1)))
    elif ext == 'loom':
        # in loom file, convention is rows are genes :(
        # return anndata.read_loom(path, X_name='matrix', sparse=True)
        f = h5py.File(path, 'r')
        x = f['/matrix']
        is_x_sparse = x.attrs.get('sparse')
        if is_x_sparse:
            # read in blocks of 1000
            chunk_start = 0
            nrows = x.shape[0]
            chunk_step = min(nrows, 1000)
            chunk_stop = chunk_step
            nchunks = int(np.ceil(max(1, nrows / chunk_step)))
            sparse_arrays = []
            for chunk in range(nchunks):
                chunk_stop = min(nrows, chunk_stop)
                subset = scipy.sparse.csr_matrix(x[chunk_start:chunk_stop])
                sparse_arrays.append(subset)
                chunk_start += chunk_step
                chunk_stop += chunk_step

            x = scipy.sparse.vstack(sparse_arrays)
        else:
            x = x[()]
        row_meta = {}
        row_attrs = f['/row_attrs']
        for key in row_attrs:
            values = row_attrs[key][()]
            if values.dtype.kind == 'S':
                values = values.astype(str)
            row_meta[key] = values
        row_meta = pd.DataFrame(data=row_meta)
        if row_meta.get('id') is not None:
            row_meta.set_index('id', inplace=True)
        elif row_meta.shape[1] == 1:
            row_meta.set_index(row_meta.columns[0], inplace=True)
        col_meta = {}
        col_attrs = f['/col_attrs']
        for key in col_attrs:
            values = col_attrs[key][()]
            if values.dtype.kind == 'S':
                values = values.astype(str)
            col_meta[key] = values
        col_meta = pd.DataFrame(data=col_meta)
        if col_meta.get('id') is not None:
            col_meta.set_index('id', inplace=True)
        elif col_meta.shape[1] == 1:
            col_meta.set_index(col_meta.columns[0], inplace=True)
        f.close()
        return anndata.AnnData(X=x, obs=row_meta, var=col_meta)
    elif ext == 'h5ad':
        return anndata.read_h5ad(path, backed=backed)
    elif ext == 'hdf5':  # 'h5' is already handled by the 10x reader above
        return anndata.read_hdf(path)
    elif ext == 'gct':
        ds = wot.io.read_gct(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return ds
    else:  # txt
        df = pd.read_csv(path,
                         engine='python',
                         header=0,
                         sep=None,
                         index_col=0)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(X=df.values,
                               obs=pd.DataFrame(index=df.index),
                               var=pd.DataFrame(index=df.columns))
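# A minimal, self-contained sketch of the fallback above: when no barcodes /
# genes files are found, obs and var become empty DataFrames whose only
# content is a positional RangeIndex matching the matrix dimensions.
import numpy as np
import pandas as pd

x = np.random.rand(5, 3)  # 5 cells x 3 genes
obs = pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[0], step=1))
var = pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[1], step=1))
assert (len(obs), len(var)) == x.shape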
Example #8
0
                    f.write('packageName=%s (%s) %s\n' %
                            (row['title'], tag, cd))
                    f.write('text=' + ' '.join(urls) + '\n')
                    f.write('comment=' + row['url'] + '\n')

            if nocds:
                filename = 'crawljob/hulkpop-%05d-%s.crawljob' % (row.name,
                                                                  tag)
                with open(filename, mode='w', encoding='utf8') as f:
                    f.write('packageName=' + row['title'] + ' (' + tag + ')\n')
                    f.write('text=' + ' '.join(nocds) + '\n')
                    f.write('comment=' + row['url'] + '\n')
            error = False
    pbar.update(1)
    return bool(error or row['error'])


df = dd.read_csv('out/hulkpop-links-*.csv',
                 encoding='UTF-8',
                 dtype='object',
                 keep_default_na=False).compute()
df = df[df['error'] != 'True']
df.index = pd.RangeIndex(0, len(df))

pbar = tqdm(total=len(df), ncols=80)
errors = df[df.apply(create_createjob, axis=1)]
errors.to_csv('errors.csv',
              encoding='UTF-8',
              index=False,
              line_terminator='\n')
Example #9
0
"""
This example shows how you can use selections and layers to create a
multi-line tooltip that tracks the x-position of the cursor.

To find the x-position of the cursor, we employ a little trick: we add some
transparent points with only an x encoding (no y encoding) and attach a
*nearest* selection to them, keyed on the "x" field.
"""
# category: interactive charts
import altair as alt
import pandas as pd
import numpy as np

np.random.seed(42)
data = pd.DataFrame(np.cumsum(np.random.randn(100, 3), 0).round(2),
                    columns=['A', 'B', 'C'],
                    index=pd.RangeIndex(100, name='x'))
data = data.reset_index().melt('x', var_name='category', value_name='y')

# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection(type='single',
                        nearest=True,
                        on='mouseover',
                        fields=['x'],
                        empty='none')

# The basic line
line = alt.Chart().mark_line(interpolate='basis').encode(x='x:Q',
                                                         y='y:Q',
                                                         color='category:N')

# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
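# The snippet is truncated here; below is a hedged reconstruction of the
# remaining layers (selectors, points, labels, rule), following the published
# Altair gallery example rather than the verbatim source above:
selectors = alt.Chart().mark_point().encode(
    x='x:Q',
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Draw points on the line, and highlight based on selection
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'y:Q', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart().mark_rule(color='gray').encode(
    x='x:Q',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
alt.layer(line, selectors, points, rules, text,
          data=data, width=600, height=300)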
Example #10
0
        ],
    ],
)
def test_infer_nullable_series_schema_statistics(null_index, series,
                                                 expectation):
    """Test nullable series statistics are correctly inferred."""
    series.iloc[null_index] = None
    statistics = schema_statistics.infer_series_statistics(series)
    assert statistics == expectation


@pytest.mark.parametrize(
    "index, expectation",
    [
        [
            pd.RangeIndex(20),
            [{
                "name": None,
                "pandas_dtype": PandasDtype.Int,
                "nullable": False,
                "checks": {
                    "greater_than_or_equal_to": 0,
                    "less_than_or_equal_to": 19,
                },
            }],
        ],
        [
            pd.Index([1, 2, 3], name="int_index"),
            [{
                "name": "int_index",
                "pandas_dtype": PandasDtype.Int,
Example #11
0
def compute_validation_summary(ot_model, day_triplets=None, interp_size=10000, compute_full_distances=False):
    """
    Compute the validation summary for the given OTModel

    Parameters
    ----------
    ot_model : wot.OTModel
        The OTModel to validate
    day_triplets : list of (float, float, float)
        List of day triplets (t0, t0.5, t1) or None to use all consecutive triplets
    interp_size : int, optional
        The number of cells in the interpolated population
    compute_full_distances : bool
        Whether to compute full distances

    Returns
    -------
    validation_summary : pandas.DataFrame
        The validation summary
    """
    if ot_model.covariate_field not in ot_model.matrix.obs:
        ot_model.matrix.obs['covariate'] = 1
    if day_triplets is None:
        day_triplets = []
        unique_times = np.array(ot_model.timepoints)
        for i in range(len(unique_times) - 2):
            t0 = unique_times[i]
            t05 = unique_times[i + 1]
            t1 = unique_times[i + 2]
            day_triplets.append((t0, t05, t1))

    day_pairs = {}
    for triplet in day_triplets:
        day_pairs[(triplet[0], triplet[2])] = {}
    ot_model.day_pairs = day_pairs
    has_covariate = ot_model.covariate_field is not None
    if not has_covariate and not compute_full_distances:
        raise ValueError('No covariate specified. Please provide a covariate or compute full distances')

    summary_list = []
    # 'P': ["#e41a1c", "between real batches"],
    # 'I': ["#377eb8", "between interpolated and real batch"],
    # 'F': ["#4daf4a", "between first and real "],
    # 'L': ["#984ea3", "between last and real"],
    # 'R': ["#ff7f00", "between random (no growth) and real"],
    # 'Rg': ["#ffff33", "between random (with growth) and real"]

    local_pca = ot_model.ot_config['local_pca']

    for triplet in day_triplets:
        t0, t05, t1 = triplet
        interp_frac = (t05 - t0) / (t1 - t0)

        p0_ds = ot_model.matrix[ot_model.matrix.obs[ot_model.day_field] == float(t0), :]
        p05_ds = ot_model.matrix[ot_model.matrix.obs[ot_model.day_field] == float(t05), :]
        p1_ds = ot_model.matrix[ot_model.matrix.obs[ot_model.day_field] == float(t1), :]

        if local_pca > 0:
            matrices = list()
            matrices.append(p0_ds.X if not scipy.sparse.isspmatrix(p0_ds.X) else p0_ds.X.toarray())
            matrices.append(p1_ds.X if not scipy.sparse.isspmatrix(p1_ds.X) else p1_ds.X.toarray())
            p0_pca, p1_pca, pca, mean_shift = wot.ot.compute_pca(p0_ds.X, p1_ds.X, local_pca)
            p0_ds = anndata.AnnData(p0_pca, obs=p0_ds.obs,
                                    var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=local_pca, step=1)))
            p1_ds = anndata.AnnData(p1_pca, obs=p1_ds.obs,
                                    var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=local_pca, step=1)))

            eigenvals = np.diag(pca.singular_values_)
            U = np.vstack(matrices).T.dot(pca.components_.T).dot(np.diag(1 / pca.singular_values_))
            y = p05_ds.X - mean_shift

            p05_ds = anndata.AnnData(np.diag(1 / pca.singular_values_).dot(U.T.dot(y.T)).T, obs=p05_ds.obs,
                                     var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=local_pca, step=1)))

        if compute_full_distances:
            tmap_full = ot_model.compute_transport_map(t0, t1)

            def update_full_summary(pop, t, name, pop2=p05_ds.X):
                dist = wot.ot.earth_mover_distance(pop, pop2, eigenvals if local_pca > 0 else None)
                summary_list.append(
                    {'interval_start': t0,
                     'interval_mid': t05,
                     'interval_end': t1,
                     't0': t,
                     't1': t05,
                     'cv0': '',
                     'cv1': '',
                     'name': name,
                     'distance': dist,
                     'full': True})

            if ot_model.cell_growth_rate_field in ot_model.matrix.obs:
                growth = p0_ds.obs[ot_model.cell_growth_rate_field].values ** interp_frac
                r05_with_growth = wot.ot.interpolate_randomly_with_growth(
                    p0_ds.X, p1_ds.X, interp_frac, interp_size, growth)
                update_full_summary(r05_with_growth, t05, 'Rg')
            r05_no_growth = wot.ot.interpolate_randomly(p0_ds.X, p1_ds.X, interp_frac, interp_size)
            update_full_summary(r05_no_growth, t05, 'R')
            try:
                i05 = wot.ot.interpolate_with_ot(p0_ds.X, p1_ds.X, tmap_full.X, interp_frac,
                                                 interp_size)  # TODO handle downsampling cells case
                update_full_summary(i05, t05, 'I')
                update_full_summary(i05, t05, 'I1', p0_ds.X)
                update_full_summary(i05, t05, 'I2', p1_ds.X)
            except ValueError:
                pass

            update_full_summary(p0_ds.X, t0, 'F')
            update_full_summary(p1_ds.X, t1, 'L')
            update_full_summary(p0_ds.X, t1, 'A', p1_ds.X)

        if not has_covariate:
            continue
        p0 = wot.split_anndata(p0_ds, ot_model.covariate_field)
        p05 = wot.split_anndata(p05_ds, ot_model.covariate_field)
        p1 = wot.split_anndata(p1_ds, ot_model.covariate_field)
        for cv05 in p05.keys():
            p05_x = p05[cv05].X
            seen_first = set()
            seen_last = set()

            def distance_to_p05(pop, t, name, cv):
                dist = wot.ot.earth_mover_distance(pop, p05_x, eigenvals if local_pca > 0 else None)
                summary_list.append(
                    {'interval_start': t0,
                     'interval_mid': t05,
                     'interval_end': t1,
                     't0': t,
                     't1': t05,
                     'cv0': cv,
                     'cv1': cv05,
                     'name': name,
                     'distance': dist,
                     'full': False})

            # p05_x = wot.ot.pca_transform(pca, mean, p05[cv05].X)

            for cv05_2 in p05.keys():  # distance between batches
                if cv05_2 != cv05:
                    distance_to_p05(p05[cv05_2].X, t05, 'P', cv05_2)

            for cv0, cv1 in itertools.product(p0.keys(), p1.keys()):
                tmap = ot_model.compute_transport_map(t0, t1, covariate=(cv0, cv1))
                if tmap is None:
                    # no data for combination of day and covariate
                    continue
                # interp_size = (len(p0[cv0]) + len(p1[cv1])) / 2
                # pca, mean = wot.ot.get_pca(local_pca, p0[cv0].X, p1[cv1].X)
                # p0_x = wot.ot.pca_transform(pca, mean, p0[cv0].X)
                # p1_x = wot.ot.pca_transform(pca, mean, p1[cv1].X)
                p0_x = p0[cv0].X
                p1_x = p1[cv1].X
                i05 = wot.ot.interpolate_with_ot(p0_x, p1_x, tmap.X, interp_frac, interp_size)
                if ot_model.cell_growth_rate_field in ot_model.matrix.obs:
                    growth = p0[cv0].obs[ot_model.cell_growth_rate_field].values ** interp_frac
                    r05_with_growth = wot.ot.interpolate_randomly_with_growth(
                        p0_x, p1_x, interp_frac, interp_size, growth)
                    distance_to_p05(r05_with_growth, t05, 'Rg', (cv0, cv1))

                r05_no_growth = wot.ot.interpolate_randomly(p0_x, p1_x, interp_frac, interp_size)
                distance_to_p05(i05, t05, 'I', (cv0, cv1))

                distance_to_p05(r05_no_growth, t05, 'R', (cv0, cv1))

                if cv0 == cv05 and cv0 not in seen_first:
                    seen_first.add(cv0)
                    distance_to_p05(p0_x, t0, 'F', cv0)
                if cv1 == cv05 and cv1 not in seen_last:
                    seen_last.add(cv1)
                    distance_to_p05(p1_x, t1, 'L', cv1)

                # if save_interpolated:
                #     prefix = os.path.join(tmap_dir, tmap_prefix)
                #     prefix += '_{}_{}_cv{}_cv{}'.format(t0, t1, cv0, cv1)
                #     wot.io.write_dataset(wot.dataset_from_x(i05),
                #                          prefix + '_interp.txt')
                # wot.io.write_dataset(wot.dataset_from_x(r05),
                #                      prefix + '_random.txt')

    return pd.DataFrame(summary_list)
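# A minimal, self-contained sketch of the triplet construction above:
# consecutive (t0, t0.5, t1) windows over the sorted unique timepoints.
unique_times = [0.0, 0.5, 1.0, 2.0]
day_triplets = [(unique_times[i], unique_times[i + 1], unique_times[i + 2])
                for i in range(len(unique_times) - 2)]
print(day_triplets)  # [(0.0, 0.5, 1.0), (0.5, 1.0, 2.0)]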
Example #12
0
File: test_core.py  Project: fyrestone/mars
def test_to_frame_or_series(setup):
    raw = pd.Series(np.random.rand(10), name='col')
    series = Series(raw)

    r = series.to_frame()
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(), result)

    r = series.to_frame(name='new_name')
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(name='new_name'), result)

    series = series[series > 0.1]
    r = series.to_frame(name='new_name')
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw[raw > 0.1].to_frame(name='new_name'),
                                  result)

    raw = pd.Index(np.random.rand(10), name='col')
    index = Index(raw)

    r = index.to_frame()
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(), result)

    r = index.to_frame(index=False)
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(index=False), result)

    r = index.to_frame(name='new_name')
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(name='new_name'), result)

    r = index.to_series()
    result = r.execute().fetch()
    pd.testing.assert_series_equal(raw.to_series(), result)

    r = index.to_series(index=pd.RangeIndex(0, 10))
    result = r.execute().fetch()
    pd.testing.assert_series_equal(raw.to_series(index=pd.RangeIndex(0, 10)),
                                   result)

    r = index.to_series(name='new_name')
    result = r.execute().fetch()
    pd.testing.assert_series_equal(raw.to_series(name='new_name'), result)

    raw = pd.MultiIndex.from_tuples([('A', 'E'), ('B', 'F'), ('C', 'G')])
    index = Index(raw, tupleize_cols=True)

    r = index.to_frame()
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(), result)

    with pytest.raises(TypeError):
        index.to_frame(name='XY')

    with pytest.raises(ValueError):
        index.to_frame(name=['X', 'Y', 'Z'])

    r = index.to_frame(name=['X', 'Y'])
    result = r.execute().fetch()
    pd.testing.assert_frame_equal(raw.to_frame(name=['X', 'Y']), result)

    r = index.to_series(name='new_name')
    result = r.execute().fetch()
    pd.testing.assert_series_equal(raw.to_series(name='new_name'), result)
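# Plain-pandas counterpart of the Mars assertions above (no execution engine
# needed): Index.to_frame and Index.to_series with an explicit RangeIndex.
import numpy as np
import pandas as pd

raw = pd.Index(np.random.rand(10), name='col')
print(raw.to_frame(index=False).head(2))                 # one 'col' column, RangeIndex rows
print(raw.to_series(index=pd.RangeIndex(0, 10)).head(2))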
Example #13
0
    def execute(cls, ctx, op: 'DataFrameReadSQL'):
        import sqlalchemy as sa

        def _adapt_datetime(dt):
            if isinstance(dt, np.datetime64):
                return dt.astype('<M8[ms]').astype(datetime.datetime)
            elif isinstance(dt, pd.Timestamp):
                return dt.to_pydatetime()
            return dt

        out = op.outputs[0]

        engine = sa.create_engine(op.con, **(op.engine_kwargs or dict()))
        try:
            selectable, _ = op._get_selectable(engine)

            columns = [selectable.columns[col] for col in op.columns]
            column_names = set(op.columns)
            if op.index_col:
                for icol in op.index_col:
                    if icol not in column_names:
                        columns.append(selectable.columns[icol])

            # convert to Python datetime in case numpy / pandas time types are not handled by the driver
            op._low_limit = _adapt_datetime(op._low_limit)
            op._high_limit = _adapt_datetime(op._high_limit)

            query = sa.sql.select(columns)
            if op.method == 'partition':
                part_col = selectable.columns[op.partition_col]
                if op.left_end:
                    query = query.where(part_col < op.high_limit)
                elif op.right_end:
                    query = query.where(part_col >= op.low_limit)
                else:
                    query = query.where((part_col >= op.low_limit) & (part_col < op.high_limit))

            if hasattr(selectable, 'primary_key') and len(selectable.primary_key) > 0:
                # if table has primary key, sort as the order
                query = query.order_by(*list(selectable.primary_key))
            elif op.index_col:
                # if no primary key, sort as the index_col
                query = query.order_by(
                    *[selectable.columns[col] for col in op.index_col])
            else:
                # as a last resort, sort by all the columns
                query = query.order_by(*columns)

            if op.method == 'offset':
                query = query.limit(out.shape[0])
                if op.offset > 0:
                    query = query.offset(op.offset)

            df = pd.read_sql(query, engine, index_col=op.index_col,
                             coerce_float=op.coerce_float,
                             parse_dates=op.parse_dates)
            if op.method == 'offset' and op.index_col is None and op.offset > 0:
                df.index = pd.RangeIndex(op.offset, op.offset + out.shape[0])
            ctx[out.key] = df
        finally:
            engine.dispose()
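# A self-contained sketch of the 'offset' fix-up above: pd.read_sql returns a
# fresh 0-based RangeIndex, so a chunk read with OFFSET must be re-indexed to
# its global position (table and column names here are illustrative, not
# Mars APIs).
import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine('sqlite://')
pd.DataFrame({'a': range(10)}).to_sql('t', engine, index=False)
offset, rows = 4, 3
chunk = pd.read_sql(f'SELECT a FROM t ORDER BY a LIMIT {rows} OFFSET {offset}', engine)
chunk.index = pd.RangeIndex(offset, offset + len(chunk))
print(chunk)
engine.dispose()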
Example #14
0
    def __call__(self, test_rows, chunk_size):
        import sqlalchemy as sa
        from sqlalchemy.sql import elements

        with create_sa_connection(self._con, **(self._engine_kwargs or dict())) as con:
            self._con = str(con.engine.url)

            selectable, src_columns = self._get_selectable(con)

            # process index_col
            index_col = self._index_col
            if index_col is not None:
                if not isinstance(index_col, (list, tuple)):
                    index_col = (index_col,)
                new_index_col = []
                sa_index_col = []
                for col in index_col:
                    if isinstance(col, (sa.Column, elements.Label)):
                        new_index_col.append(col.name)
                        sa_index_col.append(col)
                    elif isinstance(col, str):
                        sa_index_col.append(selectable.columns[col])
                        new_index_col.append(col)
                    elif col is not None:
                        raise TypeError('unknown index_col type: {}'.format(type(col)))
                self._index_col = new_index_col
                index_col = sa_index_col

            # process columns
            columns = self._columns if self._columns is not None else src_columns
            new_columns = []
            sa_columns = []
            for col in columns:
                if isinstance(col, str):
                    new_columns.append(col)
                    sa_columns.append(selectable.columns[col])
                else:
                    new_columns.append(col.name)
                    sa_columns.append(col)
            self._columns = new_columns
            if self._index_col is not None:
                for icol in index_col:
                    sa_columns.append(icol)

            test_df, shape = self._collect_info(con, selectable, sa_columns, test_rows)

            if self.method == 'partition':
                if not self.index_col or self.partition_col not in self.index_col:
                    part_frame = test_df
                else:
                    part_frame = test_df.index.to_frame()

                if not issubclass(part_frame[self.partition_col].dtype.type, (np.number, np.datetime64)):
                    raise TypeError('Type of partition column should be numeric or datetime, '
                                    'now it is %r' % part_frame[self.partition_col].dtype)

            if isinstance(test_df.index, pd.RangeIndex):
                index_value = parse_index(pd.RangeIndex(shape[0] if not np.isnan(shape[0]) else -1),
                                          str(selectable), self._con)
            else:
                index_value = parse_index(test_df.index)
            columns_value = parse_index(test_df.columns, store_data=True)
            return self.new_dataframe(None, shape=shape, dtypes=test_df.dtypes,
                                      index_value=index_value,
                                      columns_value=columns_value,
                                      raw_chunk_size=chunk_size)
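# The unknown-length trick above in isolation: when the row count is not yet
# known (NaN in the shape), a RangeIndex built from -1 is stored as a
# placeholder until real chunk sizes are available.
import numpy as np
import pandas as pd

shape = (np.nan, 3)
index = pd.RangeIndex(shape[0] if not np.isnan(shape[0]) else -1)
print(index)  # RangeIndex(start=0, stop=-1, step=1): stop=-1 marks "unknown"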
Example #15
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd
from numpy.random import permutation, randn

from legate import pandas as lp
from tests.utils import equals

n = 17
indices = [pd.RangeIndex(1, n + 1), pd.Index(permutation(n))]

for index in indices:
    print(f"Index: {index}")
    df1 = pd.DataFrame({1: randn(n), 2: randn(n), 5: randn(n)}, index=index)
    ldf1 = lp.DataFrame(df1)
    df2 = pd.DataFrame({1: randn(n), 2: randn(n), 5: randn(n)}, index=index)

    out_pd = df1 + df2
    out_lp = ldf1 + df2
    assert equals(out_lp, out_pd)

    out_pd = df1 + df2.values
    out_lp = ldf1 + df2.values
    assert equals(out_lp, out_pd)
log.info(f"{tweet_text_dataframe.columns}\n")

# Drop any NaN or empty Tweet rows in dataframe (or else CountVectorizer will blow up).
tweet_text_dataframe = tweet_text_dataframe.dropna()

# Print shape and column names.
log.info(
    f"\nThe shape of the Tweet text dataframe with NaN (empty) rows dropped:")
log.info(f"{tweet_text_dataframe.shape}\n")
log.info(
    f"\nThe columns of the Tweet text dataframe with NaN (empty) rows dropped:"
)
log.info(f"{tweet_text_dataframe.columns}\n")

# Reindex everything.
tweet_text_dataframe.index = pd.RangeIndex(len(tweet_text_dataframe.index))

# Assign column names.
tweet_text_dataframe_column_names = [
    'text_derived', 'text_derived_preprocessed', 'text_derived_postprocessed'
]

# Rename column in dataframe.
tweet_text_dataframe.columns = tweet_text_dataframe_column_names

# Create input feature.
selected_features = tweet_text_dataframe[['text_derived_postprocessed']]
processed_features = selected_features.copy()

# Check what we are using as inputs.
log.info(f"\nA sample Tweet in our input feature:")
Example #17
0
class TestNumericArraylikeArithmeticWithTimedeltaLike(object):

    # TODO: also check name retention
    @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize('left', [
        pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype)
                                      for dtype in ['i1', 'i2', 'i4', 'i8',
                                                    'u1', 'u2', 'u4', 'u8',
                                                    'f2', 'f4', 'f8']
                                      for cls in [pd.Series, pd.Index]],
        ids=lambda x: type(x).__name__ + str(x.dtype))
    def test_mul_td64arr(self, left, box_cls):
        # GH#22390
        right = np.array([1, 2, 3], dtype='m8[s]')
        right = box_cls(right)

        expected = pd.TimedeltaIndex(['10s', '40s', '90s'])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = left * right
        tm.assert_equal(result, expected)

        result = right * left
        tm.assert_equal(result, expected)

    # TODO: also check name retention
    @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize('left', [
        pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype)
                                      for dtype in ['i1', 'i2', 'i4', 'i8',
                                                    'u1', 'u2', 'u4', 'u8',
                                                    'f2', 'f4', 'f8']
                                      for cls in [pd.Series, pd.Index]],
        ids=lambda x: type(x).__name__ + str(x.dtype))
    def test_div_td64arr(self, left, box_cls):
        # GH#22390
        right = np.array([10, 40, 90], dtype='m8[s]')
        right = box_cls(right)

        expected = pd.TimedeltaIndex(['1s', '2s', '3s'])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = right / left
        tm.assert_equal(result, expected)

        result = right // left
        tm.assert_equal(result, expected)

        with pytest.raises(TypeError):
            left / right

        with pytest.raises(TypeError):
            left // right

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        # regression test for GH#8813
        td = Timedelta('1 day')
        other = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(['1 day', '2 days']))
        tm.assert_series_equal(expected, td * other)
        tm.assert_series_equal(expected, other * td)

    # TODO: also test non-nanosecond timedelta64 and Tick objects;
    #  see test_numeric_arr_rdiv_tdscalar for note on these failing
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box):
        # GH#19333
        index = numeric_idx

        expected = pd.timedelta_range('0 days', '4 days')

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box):

        if box is not pd.Index and isinstance(three_days, pd.offsets.Tick):
            raise pytest.xfail("Tick division not implemented")

        index = numeric_idx[1:3]

        expected = TimedeltaIndex(['3 Days', '36 Hours'])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = three_days / index
        tm.assert_equal(result, expected)

        with pytest.raises(TypeError):
            index / three_days

    @pytest.mark.parametrize('other', [
        pd.Timedelta(hours=31),
        pd.Timedelta(hours=31).to_pytimedelta(),
        pd.Timedelta(hours=31).to_timedelta64(),
        pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'),
        np.timedelta64('NaT'),
        np.timedelta64('NaT', 'D'),
        pd.offsets.Minute(3),
        pd.offsets.Second(0)])
    def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box):
        left = tm.box_expected(numeric_idx, box)
        with pytest.raises(TypeError):
            left + other
        with pytest.raises(TypeError):
            other + left
        with pytest.raises(TypeError):
            left - other
        with pytest.raises(TypeError):
            other - left
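# A self-contained illustration of the behavior these tests pin down (GH#22390):
# multiplying a numeric RangeIndex by timedelta64 data yields a TimedeltaIndex.
import numpy as np
import pandas as pd

left = pd.RangeIndex(10, 40, 10)           # [10, 20, 30]
right = np.array([1, 2, 3], dtype='m8[s]')
print(left * right)  # TimedeltaIndex(['0 days 00:00:10', '0 days 00:00:40', '0 days 00:01:30'], ...)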
Example #18
0
    def testFromTensorExecution(self):
        tensor = mt.random.rand(10, 10, chunk_size=5)
        df = dataframe_from_tensor(tensor)
        tensor_res = self.executor.execute_tensor(tensor, concat=True)[0]
        pdf_expected = pd.DataFrame(tensor_res)
        df_result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(df_result, pdf_expected)

        # test converted with specified index_value and columns
        tensor2 = mt.random.rand(2, 2, chunk_size=1)
        df2 = dataframe_from_tensor(tensor2,
                                    index=pd.Index(['a', 'b']),
                                    columns=pd.Index([3, 4]))
        df_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        tensor3 = mt.array([1, 2, 3])
        df3 = dataframe_from_tensor(tensor3)
        result3 = self.executor.execute_dataframe(df3, concat=True)[0]
        pdf_expected = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(pdf_expected, result3)

        # test converted from identical chunks
        tensor4 = mt.ones((10, 10), chunk_size=3)
        df4 = dataframe_from_tensor(tensor4)
        result4 = self.executor.execute_dataframe(df4, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor4, concat=True)[0])
        pd.testing.assert_frame_equal(pdf_expected, result4)

        # from tensor with given index
        tensor5 = mt.ones((10, 10), chunk_size=3)
        df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2))
        result5 = self.executor.execute_dataframe(df5, concat=True)[0]
        pdf_expected = pd.DataFrame(self.executor.execute_tensor(
            tensor5, concat=True)[0],
                                    index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(pdf_expected, result5)

        # from tensor with given index that is a tensor
        raw7 = np.random.rand(10, 10)
        tensor7 = mt.tensor(raw7, chunk_size=3)
        index_raw7 = np.random.rand(10)
        index7 = mt.tensor(index_raw7, chunk_size=4)
        df7 = dataframe_from_tensor(tensor7, index=index7)
        result7 = self.executor.execute_dataframe(df7, concat=True)[0]
        pdf_expected = pd.DataFrame(raw7, index=index_raw7)
        pd.testing.assert_frame_equal(pdf_expected, result7)

        # from tensor with given index is a md.Index
        raw10 = np.random.rand(10, 10)
        tensor10 = mt.tensor(raw10, chunk_size=3)
        index10 = md.date_range('2020-1-1', periods=10, chunk_size=3)
        df10 = dataframe_from_tensor(tensor10, index=index10)
        result10 = self.executor.execute_dataframe(df10, concat=True)[0]
        pdf_expected = pd.DataFrame(raw10,
                                    index=pd.date_range('2020-1-1',
                                                        periods=10))
        pd.testing.assert_frame_equal(pdf_expected, result10)

        # from tensor with given columns
        tensor6 = mt.ones((10, 10), chunk_size=3)
        df6 = dataframe_from_tensor(tensor6, columns=list('abcdefghij'))
        result6 = self.executor.execute_dataframe(df6, concat=True)[0]
        pdf_expected = pd.DataFrame(self.executor.execute_tensor(
            tensor6, concat=True)[0],
                                    columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(pdf_expected, result6)

        # from 1d tensors
        raws8 = [('a', np.random.rand(8)),
                 ('b', np.random.randint(10, size=8)),
                 ('c', [''.join(np.random.choice(list(printable), size=6))
                        for _ in range(8)])]
        tensors8 = OrderedDict(
            (r[0], mt.tensor(r[1], chunk_size=3)) for r in raws8)
        raws8.append(('d', 1))
        raws8.append(('e', pd.date_range('2020-1-1', periods=8)))
        tensors8['d'] = 1
        tensors8['e'] = raws8[-1][1]
        df8 = dataframe_from_1d_tileables(tensors8,
                                          columns=[r[0] for r in raws8])
        result = self.executor.execute_dataframe(df8, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8))
        pd.testing.assert_frame_equal(result, pdf_expected)

        # from 1d tensors and specify index with a tensor
        index_raw9 = np.random.rand(8)
        index9 = mt.tensor(index_raw9, chunk_size=4)
        df9 = dataframe_from_1d_tileables(tensors8,
                                          columns=[r[0] for r in raws8],
                                          index=index9)
        result = self.executor.execute_dataframe(df9, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9)
        pd.testing.assert_frame_equal(result, pdf_expected)

        # from 1d tensors and specify index
        df11 = dataframe_from_1d_tileables(tensors8,
                                           columns=[r[0] for r in raws8],
                                           index=md.date_range('2020-1-1',
                                                               periods=8))
        result = self.executor.execute_dataframe(df11, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8),
                                    index=pd.date_range('2020-1-1', periods=8))
        pd.testing.assert_frame_equal(result, pdf_expected)
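# Plain-pandas counterpart of the default-index assertions above: a DataFrame
# built from a bare ndarray gets a RangeIndex on both axes.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(10, 10))
pd.testing.assert_index_equal(df.index, pd.RangeIndex(0, 10))
pd.testing.assert_index_equal(df.columns, pd.RangeIndex(0, 10))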
Example #19
0
class TestAppend:
    def test_append(self, sort, float_frame):
        mixed_frame = float_frame.copy()
        mixed_frame["foo"] = "bar"

        begin_index = float_frame.index[:5]
        end_index = float_frame.index[5:]

        begin_frame = float_frame.reindex(begin_index)
        end_frame = float_frame.reindex(end_index)

        appended = begin_frame.append(end_frame)
        tm.assert_almost_equal(appended["A"], float_frame["A"])

        del end_frame["A"]
        partial_appended = begin_frame.append(end_frame, sort=sort)
        assert "A" in partial_appended

        partial_appended = end_frame.append(begin_frame, sort=sort)
        assert "A" in partial_appended

        # mixed type handling
        appended = mixed_frame[:5].append(mixed_frame[5:])
        tm.assert_frame_equal(appended, mixed_frame)

        # what to test here
        mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort)
        mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort)

        # all equal except 'foo' column
        tm.assert_frame_equal(
            mixed_appended.reindex(columns=["A", "B", "C", "D"]),
            mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
        )

    def test_append_empty(self, float_frame):
        empty = DataFrame()

        appended = float_frame.append(empty)
        tm.assert_frame_equal(float_frame, appended)
        assert appended is not float_frame

        appended = empty.append(float_frame)
        tm.assert_frame_equal(float_frame, appended)
        assert appended is not float_frame

    def test_append_overlap_raises(self, float_frame):
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            float_frame.append(float_frame, verify_integrity=True)

    def test_append_new_columns(self):
        # see gh-6129: new columns
        df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
        row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
        expected = DataFrame({
            "a": {
                "x": 1,
                "y": 2,
                "z": 5
            },
            "b": {
                "x": 3,
                "y": 4,
                "z": 6
            },
            "c": {
                "z": 7
            },
        })
        result = df.append(row)
        tm.assert_frame_equal(result, expected)

    def test_append_length0_frame(self, sort):
        df = DataFrame(columns=["A", "B", "C"])
        df3 = DataFrame(index=[0, 1], columns=["A", "B"])
        df5 = df.append(df3, sort=sort)

        expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
        tm.assert_frame_equal(df5, expected)

    def test_append_records(self):
        arr1 = np.zeros((2, ), dtype=("i4,f4,a10"))
        arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

        arr2 = np.zeros((3, ), dtype=("i4,f4,a10"))
        arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]

        df1 = DataFrame(arr1)
        df2 = DataFrame(arr2)

        result = df1.append(df2, ignore_index=True)
        expected = DataFrame(np.concatenate((arr1, arr2)))
        tm.assert_frame_equal(result, expected)

    # rewrite sort fixture, since we also want to test default of None
    def test_append_sorts(self, sort):
        df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
        df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])

        with tm.assert_produces_warning(None):
            result = df1.append(df2, sort=sort)

        # for None / True
        expected = DataFrame(
            {
                "b": [1, 2, None, None],
                "a": [1, 2, 1, 2],
                "c": [None, None, 3, 4]
            },
            columns=["a", "b", "c"],
        )
        if sort is False:
            expected = expected[["b", "a", "c"]]
        tm.assert_frame_equal(result, expected)

    def test_append_different_columns(self, sort):
        df = DataFrame({
            "bools": np.random.randn(10) > 0,
            "ints": np.random.randint(0, 10, 10),
            "floats": np.random.randn(10),
            "strings": ["foo", "bar"] * 5,
        })

        a = df[:5].loc[:, ["bools", "ints", "floats"]]
        b = df[5:].loc[:, ["strings", "ints", "floats"]]

        appended = a.append(b, sort=sort)
        assert isna(appended["strings"][0:4]).all()
        assert isna(appended["bools"][5:]).all()

    def test_append_many(self, sort, float_frame):
        chunks = [
            float_frame[:5],
            float_frame[5:10],
            float_frame[10:15],
            float_frame[15:],
        ]

        result = chunks[0].append(chunks[1:])
        tm.assert_frame_equal(result, float_frame)

        chunks[-1] = chunks[-1].copy()
        chunks[-1]["foo"] = "bar"
        result = chunks[0].append(chunks[1:], sort=sort)
        tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
        assert (result["foo"][15:] == "bar").all()
        assert result["foo"][:15].isna().all()

    def test_append_preserve_index_name(self):
        # #980
        df1 = DataFrame(columns=["A", "B", "C"])
        df1 = df1.set_index(["A"])
        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
                        columns=["A", "B", "C"])
        df2 = df2.set_index(["A"])

        result = df1.append(df2)
        assert result.index.name == "A"

    indexes_can_append = [
        pd.RangeIndex(3),
        Index([4, 5, 6]),
        Index([4.5, 5.5, 6.5]),
        Index(list("abc")),
        pd.CategoricalIndex("A B C".split()),
        pd.CategoricalIndex("D E F".split(), ordered=True),
        pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
        pd.DatetimeIndex([
            dt.datetime(2013, 1, 3, 0, 0),
            dt.datetime(2013, 1, 3, 6, 10),
            dt.datetime(2013, 1, 3, 7, 12),
        ]),
        pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
    ]

    @pytest.mark.parametrize("index",
                             indexes_can_append,
                             ids=lambda x: type(x).__name__)
    def test_append_same_columns_type(self, index):
        # GH18359

        # df wider than ser
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
        ser_index = index[:2]
        ser = Series([7, 8], index=ser_index, name=2)
        result = df.append(ser)
        expected = DataFrame([[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]],
                             index=[0, 1, 2],
                             columns=index)
        # integer dtype is preserved for columns present in ser.index
        assert expected.dtypes.iloc[0].kind == "i"
        assert expected.dtypes.iloc[1].kind == "i"

        tm.assert_frame_equal(result, expected)

        # ser wider than df
        ser_index = index
        index = index[:2]
        df = DataFrame([[1, 2], [4, 5]], columns=index)
        ser = Series([7, 8, 9], index=ser_index, name=2)
        result = df.append(ser)
        expected = DataFrame(
            [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
            index=[0, 1, 2],
            columns=ser_index,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df_columns, series_index",
        combinations(indexes_can_append, r=2),
        ids=lambda x: type(x).__name__,
    )
    def test_append_different_columns_types(self, df_columns, series_index):
        # GH18359
        # See also test 'test_append_different_columns_types_raises' below
        # for errors raised when appending

        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
        ser = Series([7, 8, 9], index=series_index, name=2)

        result = df.append(ser)
        idx_diff = ser.index.difference(df_columns)
        combined_columns = Index(df_columns.tolist()).append(idx_diff)
        expected = DataFrame(
            [
                [1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
                [4, 5, 6, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, 7, 8, 9],
            ],
            index=[0, 1, 2],
            columns=combined_columns,
        )
        tm.assert_frame_equal(result, expected)

    def test_append_dtype_coerce(self, sort):

        # GH 4993
        # appending with datetime will incorrectly convert datetime64

        df1 = DataFrame(
            index=[1, 2],
            data=[
                dt.datetime(2013, 1, 1, 0, 0),
                dt.datetime(2013, 1, 2, 0, 0)
            ],
            columns=["start_time"],
        )
        df2 = DataFrame(
            index=[4, 5],
            data=[
                [
                    dt.datetime(2013, 1, 3, 0, 0),
                    dt.datetime(2013, 1, 3, 6, 10)
                ],
                [
                    dt.datetime(2013, 1, 4, 0, 0),
                    dt.datetime(2013, 1, 4, 7, 10)
                ],
            ],
            columns=["start_time", "end_time"],
        )

        expected = concat(
            [
                Series(
                    [
                        pd.NaT,
                        pd.NaT,
                        dt.datetime(2013, 1, 3, 6, 10),
                        dt.datetime(2013, 1, 4, 7, 10),
                    ],
                    name="end_time",
                ),
                Series(
                    [
                        dt.datetime(2013, 1, 1, 0, 0),
                        dt.datetime(2013, 1, 2, 0, 0),
                        dt.datetime(2013, 1, 3, 0, 0),
                        dt.datetime(2013, 1, 4, 0, 0),
                    ],
                    name="start_time",
                ),
            ],
            axis=1,
            sort=sort,
        )
        result = df1.append(df2, ignore_index=True, sort=sort)
        if sort:
            expected = expected[["end_time", "start_time"]]
        else:
            expected = expected[["start_time", "end_time"]]

        tm.assert_frame_equal(result, expected)

    def test_append_missing_column_proper_upcast(self, sort):
        df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
        df2 = DataFrame(
            {"B": np.array([True, False, True, False], dtype=bool)})

        appended = df1.append(df2, ignore_index=True, sort=sort)
        assert appended["A"].dtype == "f8"
        assert appended["B"].dtype == "O"

    def test_append_empty_frame_to_series_with_dateutil_tz(self):
        # GH 23682
        date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
        ser = Series({"a": 1.0, "b": 2.0, "date": date})
        df = DataFrame(columns=["c", "d"])
        result_a = df.append(ser, ignore_index=True)
        expected = DataFrame([[np.nan, np.nan, 1.0, 2.0, date]],
                             columns=["c", "d", "a", "b", "date"])
        # These columns get cast to object after append
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        tm.assert_frame_equal(result_a, expected)

        expected = DataFrame([[np.nan, np.nan, 1.0, 2.0, date]] * 2,
                             columns=["c", "d", "a", "b", "date"])
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        result_b = result_a.append(ser, ignore_index=True)
        tm.assert_frame_equal(result_b, expected)

        result = df.append([ser, ser], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_tz_frame_with_datetime64ns(self):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")

        # pd.NaT gets inferred as tz-naive, so append result is tz-naive
        result = df.append({"a": pd.NaT}, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

        # also test with typed value to append
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
        other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
        result = df.append(other, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

        # mismatched tz
        other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
        result = df.append(other, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str",
        ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"])
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype(dtype_str)

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = df.append(other, ignore_index=True)

        expected = other.astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str",
        ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"])
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame({"a": pd.array([1], dtype=dtype_str)})

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = df.append(other, ignore_index=True)

        expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]},
                             dtype=object)
        tm.assert_frame_equal(result, expected)
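# Quick illustration of the append semantics exercised above; note that
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# modern equivalent is pd.concat, which likewise rebuilds a RangeIndex when
# ignore_index=True.
import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'a': [3]})
out = pd.concat([df1, df2], ignore_index=True)
print(out.index)  # RangeIndex(start=0, stop=3, step=1)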
Example #20
0
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pandas import ExcelWriter
import xlrd
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram

df = pd.read_excel('C:/Users/user/Desktop/price all 2000x2000.xlsx')
df = df[df.날짜 <= '2018.09.01']
df = df[df.날짜 >= '2018.08.01']
df.index = pd.RangeIndex(len(df.index))
df = df.drop("날짜", 1)

writer = ExcelWriter('all_price.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()

corr = df.corr(method='pearson')
writer = ExcelWriter('corr.xlsx')
corr.to_excel(writer, 'Sheet1')
writer.save()
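# A self-contained sketch of the same pipeline without the Excel round-trips:
# filter rows on a date column, rebuild a fresh RangeIndex, then correlate.
import numpy as np
import pandas as pd

df = pd.DataFrame({'날짜': pd.date_range('2018-07-25', periods=10).astype(str),
                   'p1': np.random.rand(10), 'p2': np.random.rand(10)})
df = df[(df['날짜'] >= '2018-08-01') & (df['날짜'] <= '2018-09-01')]
df.index = pd.RangeIndex(len(df.index))      # drop the gaps left by filtering
corr = df.drop(columns='날짜').corr(method='pearson')
print(corr)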
Example #21
0
File: utils.py  Project: Shkuin/CyberWorld
def _nonempty_index(idx):
    typ = type(idx)
    if typ is pd.RangeIndex:
        return pd.RangeIndex(2, name=idx.name)
    elif typ in _numeric_index_types:
        return typ([1, 2], name=idx.name)
    elif typ is pd.Index:
        return pd.Index(["a", "b"], name=idx.name)
    elif typ is pd.DatetimeIndex:
        start = "1970-01-01"
        # Need a non-monotonic decreasing index to avoid issues with
        # partial string indexing see https://github.com/dask/dask/issues/2389
        # and https://github.com/pandas-dev/pandas/issues/16515
        # This doesn't mean `_meta_nonempty` should ever rely on
        # `self.monotonic_increasing` or `self.monotonic_decreasing`
        try:
            return pd.date_range(start=start,
                                 periods=2,
                                 freq=idx.freq,
                                 tz=idx.tz,
                                 name=idx.name)
        except ValueError:  # older pandas versions
            data = [start, "1970-01-02"] if idx.freq is None else None
            return pd.DatetimeIndex(data,
                                    start=start,
                                    periods=2,
                                    freq=idx.freq,
                                    tz=idx.tz,
                                    name=idx.name)
    elif typ is pd.PeriodIndex:
        return pd.period_range(start="1970-01-01",
                               periods=2,
                               freq=idx.freq,
                               name=idx.name)
    elif typ is pd.TimedeltaIndex:
        start = np.timedelta64(1, "D")
        try:
            return pd.timedelta_range(start=start,
                                      periods=2,
                                      freq=idx.freq,
                                      name=idx.name)
        except ValueError:  # older pandas versions
            start = np.timedelta64(1, "D")
            data = [start, start + 1] if idx.freq is None else None
            return pd.TimedeltaIndex(data,
                                     start=start,
                                     periods=2,
                                     freq=idx.freq,
                                     name=idx.name)
    elif typ is pd.CategoricalIndex:
        if len(idx.categories) == 0:
            data = pd.Categorical(_nonempty_index(idx.categories),
                                  ordered=idx.ordered)
        else:
            data = pd.Categorical.from_codes([-1, 0],
                                             categories=idx.categories,
                                             ordered=idx.ordered)
        return pd.CategoricalIndex(data, name=idx.name)
    elif typ is pd.MultiIndex:
        levels = [_nonempty_index(l) for l in idx.levels]
        codes = [[0, 0] for i in idx.levels]
        try:
            return pd.MultiIndex(levels=levels, codes=codes, names=idx.names)
        except TypeError:  # older pandas versions
            return pd.MultiIndex(levels=levels, labels=codes, names=idx.names)

    raise TypeError("Don't know how to handle index of type {0}".format(
        typename(type(idx))))
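# The RangeIndex branch above in isolation: a two-element stand-in that keeps
# only the name, which is all the _meta_nonempty machinery needs.
import pandas as pd

idx = pd.RangeIndex(1000, name='rows')
meta = pd.RangeIndex(2, name=idx.name)
print(meta)  # RangeIndex(start=0, stop=2, step=1, name='rows')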
Example #22
0
File: out.py  Project: arsalan-akhtar/sisl
    def read_charge(self,
                    name,
                    iscf=Opt.ANY,
                    imd=Opt.ANY,
                    key_scf="scf",
                    as_dataframe=False):
        r"""Read charges calculated in SCF loop or MD loop (or both)

        Siesta enables many different modes of writing out charges.

        NOTE: currently Mulliken charges are not implemented.

        The below table shows a list of different cases that
        may be encountered, the letters are referred to in the
        return section to indicate what is returned.

        +-----------+-----+-----+--------+-------+------------------+
        | Case      | *A* | *B* | *C*    | *D*   | *E*              |
        +-----------+-----+-----+--------+-------+------------------+
        | Charge    | MD  | SCF | MD+SCF | Final | Orbital resolved |
        +-----------+-----+-----+--------+-------+------------------+
        | Voronoi   | +   | +   | +      | +     | -                |
        +-----------+-----+-----+--------+-------+------------------+
        | Hirshfeld | +   | +   | +      | +     | -                |
        +-----------+-----+-----+--------+-------+------------------+
        | Mulliken  | +   | +   | +      | +     | +                |
        +-----------+-----+-----+--------+-------+------------------+

        Notes
        -----
        Errors will be raised if one requests information not present. I.e.
        passing an integer or `Opt.ALL` for `iscf` will raise an error if
        the SCF charges are not present. For `Opt.ANY` it will return
        the most information, effectively SCF will be returned if present.

        Currently Mulliken is not implemented, any help in reading this would be
        very welcome.

        Parameters
        ----------
        name: {"voronoi", "hirshfeld"}
            the name of the charges that you want to read
        iscf: int or Opt, optional
            index (0-based) of the scf iteration you want the charges for.
            If the enum specifier `Opt.ANY` or `Opt.ALL` are used, then
            the returned quantities depend on what is present.
            If ``None/Opt.NONE`` it will not return any SCF charges.
            If both `imd` and `iscf` are ``None`` then only the final charges will be returned.
        imd: int or Opt, optional
            index (0-based) of the md step you want the charges for.
            If the enum specifier `Opt.ANY` or `Opt.ALL` are used, then
            the returned quantities depend on what is present.
            If ``None/Opt.NONE`` it will not return any MD charges.
            If both `imd` and `iscf` are ``None`` then only the final charges will be returned.
        key_scf : str, optional
            the key lookup for the scf iterations (a ":" will automatically be appended)
        as_dataframe: boolean, optional
            whether charges should be returned as a pandas dataframe.

        Returns
        -------
        numpy.ndarray
            if a specific MD+SCF index is requested (or special cases where output is
            not complete)
        list of numpy.ndarray
            if one or both of `iscf` and `imd` are different from ``None/Opt.NONE``.
        pandas.DataFrame
            if `as_dataframe` is requested. The dataframe will have multi-indices if multiple
            SCF or MD steps are requested.
        """
        if not hasattr(self, 'fh'):
            with self:
                return self.read_charge(name, iscf, imd, key_scf,
                                        as_dataframe)
        namel = name.lower()
        if as_dataframe:
            import pandas as pd

            def _empty_charge():
                # build a fake dataframe with no indices
                return pd.DataFrame(index=pd.Index([],
                                                   name="atom",
                                                   dtype=np.int32),
                                    dtype=np.float32)
        else:
            pd = None

            def _empty_charge():
                # return for single value with nan values
                return _a.arrayf([[None]])

        # define helper function for reading voronoi+hirshfeld charges
        def _voronoi_hirshfeld_charges():
            """ Read output from Voronoi/Hirshfeld charges """
            nonlocal pd

            # Expecting something like this:
            # Voronoi Atomic Populations:
            # Atom #     dQatom  Atom pop         S        Sx        Sy        Sz  Species
            #      1   -0.02936   4.02936   0.00000  -0.00000   0.00000   0.00000  C

            # Define the function that parses the charges
            def _parse_charge(line):
                atom_idx, *vals, symbol = line.split()
                # assert that this is a proper line
                # this should catch cases where the following line of charge output
                # is still parseable
                atom_idx = int(atom_idx)
                return list(map(float, vals))

            # first line is the header
            header = (
                self.readline().replace("dQatom", "dq")  # dQatom in master
                .replace(" Qatom", " dq")  # Qatom in 4.1
                .replace("Atom pop", "e")  # not found in 4.1
                .split())[2:-1]

            # We have found the header, prepare a list to read the charges
            atom_charges = []
            line = ' '
            while line != "":
                try:
                    line = self.readline()
                    charge_vals = _parse_charge(line)
                    atom_charges.append(charge_vals)
                except Exception:
                    # We already have the charge values and we reached a line that can't be parsed,
                    # this means we have reached the end.
                    break
            if pd is None:
                # not as_dataframe
                return _a.arrayf(atom_charges)

            # determine how many columns we have;
            # atom indices and species were stripped, so only charge columns remain
            ncols = len(atom_charges[0])
            assert ncols == len(header)

            # the precision is limited, so no need for double precision
            return pd.DataFrame(atom_charges,
                                columns=header,
                                dtype=np.float32,
                                index=pd.RangeIndex(stop=len(atom_charges),
                                                    name="atom"))

        # define helper function for reading mulliken charges
        def _mulliken_charges():
            """ Read output from Mulliken charges """
            raise NotImplementedError(
                "Mulliken charges are not implemented currently")

        # Check that a known charge has been requested
        if namel == "voronoi":
            _r_charge = _voronoi_hirshfeld_charges
            charge_keys = [
                "Voronoi Atomic Populations", "Voronoi Net Atomic Populations"
            ]
        elif namel == "hirshfeld":
            _r_charge = _voronoi_hirshfeld_charges
            charge_keys = [
                "Hirshfeld Atomic Populations",
                "Hirshfeld Net Atomic Populations"
            ]
        elif namel == "mulliken":
            _r_charge = _mulliken_charges
            charge_keys = ["mulliken: Atomic and Orbital Populations"]
        else:
            raise ValueError(
                f"{self.__class__.__name__}.read_charge name argument should be one of {known_charges}, got {name}?"
            )

        # Ensure the key_scf matches exactly (prepend a space)
        key_scf = f" {key_scf.strip()}:"

        # Reading charges may be quite time consuming for large MD simulations.

        # to see if we finished a MD read, we check for these keys
        search_keys = [
            # two keys can signal ending SCF
            "SCF Convergence",
            "SCF_NOT_CONV",
            "siesta: Final energy",
            key_scf,
            *charge_keys
        ]
        # adjust the below while loop to take into account any additional
        # segments of search_keys
        IDX_SCF_END = [0, 1]
        IDX_FINAL = [2]
        IDX_SCF = [3]
        # the rest are charge keys
        IDX_CHARGE = list(
            range(len(search_keys) - len(charge_keys), len(search_keys)))
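        # e.g. with the two charge keys above the mapping becomes:
        #   index 0,1 -> SCF ended; 2 -> final energy; 3 -> key_scf;
        #   4,5 -> the charge_keys themselves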

        # state to figure out where we are
        state = PropertyDict()
        state.INITIAL = 0
        state.MD = 1
        state.SCF = 2
        state.CHARGE = 3
        state.FINAL = 4
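        # rough flow of states through an output file (a sketch):
        #   INITIAL -> CHARGE (initial) -> SCF ... -> MD -> CHARGE (per MD step)
        #   -> SCF ... -> ... -> FINAL -> CHARGE (final charges)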

        # a list of scf_charge
        md_charge = []
        md_scf_charge = []
        scf_charge = []
        final_charge = None

        # signal that any first reads are INITIAL charges
        current_state = state.INITIAL
        charge = _empty_charge()
        FOUND_SCF = False
        FOUND_MD = False
        FOUND_FINAL = False

        # TODO: rewrite with the walrus operator (:=) once supported
        ret = self.step_to(search_keys,
                           case=True,
                           ret_index=True,
                           reread=False)
        while ret[0]:
            if ret[2] in IDX_SCF_END:
                # we finished all SCF iterations
                current_state = state.MD
                md_scf_charge.append(scf_charge)
                scf_charge = []

            elif ret[2] in IDX_SCF:
                current_state = state.SCF
                # collect scf-charges (possibly none)
                scf_charge.append(charge)

            elif ret[2] in IDX_FINAL:
                current_state = state.FINAL
                # don't do anything, this is the final charge construct
                # regardless of where it comes from.

            elif ret[2] in IDX_CHARGE:
                FOUND_CHARGE = True  # note: not used below
                # also read charge
                charge = _r_charge()

                if state.INITIAL == current_state or state.CHARGE == current_state:
                    # this signals scf charges
                    FOUND_SCF = True
                    # There *could* be 2 steps if we are mixing H,
                    # this is because it first does
                    # compute H -> compute DM -> compute H
                    # in the first iteration; subsequently we only do
                    # compute DM -> compute H
                    # once we hit ret[2] in IDX_SCF we will append
                    scf_charge = []

                elif state.MD == current_state:
                    FOUND_MD = True
                    # we just finished an SCF cycle.
                    # So any output between SCF ending and
                    # a new one beginning *must* be that geometries
                    # charge

                    # Here `charge` may be the empty placeholder, signalling
                    # that no charges are printed in the MD steps.
                    md_charge.append(charge)

                    # reset charge
                    charge = _empty_charge()

                elif state.SCF == current_state:
                    FOUND_SCF = True

                elif state.FINAL == current_state:
                    FOUND_FINAL = True
                    # a special state writing out the charges after everything
                    final_charge = charge
                    charge = _empty_charge()
                    scf_charge = []
                    # we should be done and no other charge reads should be found!
                    # should we just break?

                current_state = state.CHARGE

            # step to next entry
            ret = self.step_to(search_keys,
                               case=True,
                               ret_index=True,
                               reread=False)

        if not any((FOUND_SCF, FOUND_MD, FOUND_FINAL)):
            raise SileError(
                f"{str(self)} does not contain any charges ({name})")

        # if there are still scf-charges stored, it means the MD-step
        # finalization has not been read, so correct for this below
        if len(scf_charge) > 0:
            assert False, "this test shouldn't reach here"
            # we must not have read through the entire MD step
            # so this has to be a running simulation
            if charge is not None:
                scf_charge.append(charge)
                charge = _empty_charge()
            md_scf_charge.append(scf_charge)

        # reaching here with left-over scf-charges would indicate a *parsing*
        # error, so for now we guard with an assert
        assert len(scf_charge) == 0

        if as_dataframe:
            # convert data to proper data structures regardless of user
            # requests; this is an overhead, but probably not that big of a problem
            if FOUND_SCF:
                md_scf_charge = pd.concat(
                    [
                        pd.concat(scf,
                                  keys=pd.RangeIndex(1, len(scf) + 1,
                                                     name="iscf"))
                        for scf in md_scf_charge
                    ],
                    keys=pd.RangeIndex(1, len(md_scf_charge) + 1, name="imd"))
            if FOUND_MD:
                md_charge = pd.concat(md_charge,
                                      keys=pd.RangeIndex(1,
                                                         len(md_charge) + 1,
                                                         name="imd"))
        else:
            if FOUND_SCF:
                nan_array = _a.emptyf(md_scf_charge[0][0].shape)
                nan_array.fill(np.nan)

                def get_md_scf_charge(scf_charge, iscf):
                    try:
                        return scf_charge[iscf]
                    except IndexError:
                        # this MD step has fewer SCF iterations than requested
                        return nan_array

            if FOUND_MD:
                md_charge = np.stack(md_charge)

        # option parsing is a bit *difficult* with flag enums,
        # so first figure out what is there and handle it
        # based on the arguments
        def _p(flag, found):
            """ Normalize `flag` against whether the data was found

            Returns
            -------
            is_opt : bool
                whether the flag is an `Opt`
            flag :
                corrected flag
            """
            if isinstance(flag, Opt):
                # correct flag depending on what `found` is
                # If the values have been found we
                # change flag to None only if flag == NONE
                # If the case has not been found, we
                # change flag to None if ANY or NONE is in flags

                if found:
                    # the flag is exactly NONE: pass None
                    if not (Opt.NONE ^ flag):
                        flag = None
                else:  # not found
                    # we convert flag to none
                    # if ANY or NONE in flag
                    if (Opt.NONE | Opt.ANY) & flag:
                        flag = None

            return isinstance(flag, Opt), flag
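        # worked examples (a sketch of the normalization):
        #   _p(Opt.NONE, found=True)  -> (False, None)
        #   _p(Opt.ANY,  found=False) -> (False, None)
        #   _p(Opt.ALL,  found=True)  -> (True, Opt.ALL)
        #   _p(3,        found=True)  -> (False, 3)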

        opt_imd, imd = _p(imd, FOUND_MD)
        opt_iscf, iscf = _p(iscf, FOUND_SCF)

        if not (FOUND_SCF or FOUND_MD):
            # none of these were found;
            # the user should then not request any of them
            if (opt_iscf or iscf is not None) or \
               (opt_imd or imd is not None):
                raise SileError(f"{str(self)} does not contain MD/SCF charges")

        elif not FOUND_SCF:
            if opt_iscf or iscf is not None:
                raise SileError(f"{str(self)} does not contain SCF charges")

        elif not FOUND_MD:
            if opt_imd or imd is not None:
                raise SileError(f"{str(self)} does not contain MD charges")

        # if either are options they may hold
        if opt_imd and opt_iscf:
            if FOUND_SCF:
                return md_scf_charge
            elif FOUND_MD:
                return md_charge
            elif FOUND_FINAL:
                # I think this will never be reached
                # If neither are found they will be converted to
                # None
                return final_charge

            raise SileError(
                f"{str(self)} unknown argument for 'imd' and 'iscf'")

        elif opt_imd:
            # flag requested imd
            if not (imd & (Opt.ANY | Opt.ALL)):
                # wrong flag
                raise SileError(f"{str(self)} unknown argument for 'imd'")

            if FOUND_SCF and iscf is not None:
                # this should be handled, i.e. the scf should be taken out
                if as_dataframe:
                    return md_scf_charge.groupby(level=[0, 2]).nth(iscf)
                return np.stack(
                    tuple(get_md_scf_charge(x, iscf) for x in md_scf_charge))

            elif FOUND_MD and iscf is None:
                return md_charge
            raise SileError(
                f"{str(self)} unknown argument for 'imd' and 'iscf', could not find SCF charges"
            )

        elif opt_iscf:
            # flag requested iscf
            if not (iscf & (Opt.ANY | Opt.ALL)):
                # wrong flag
                raise SileError(f"{str(self)} unknown argument for 'iscf'")
            if imd is None:
                # correct imd
                imd = -1
            if as_dataframe:
                md_scf_charge = md_scf_charge.groupby(level=0)
                group = list(md_scf_charge.groups.keys())[imd]
                return md_scf_charge.get_group(group).droplevel(0)
            return np.stack(md_scf_charge[imd])

        elif imd is None and iscf is None:
            if FOUND_FINAL:
                return final_charge
            raise SileError(f"{str(self)} does not contain final charges")

        elif imd is None:
            # iscf is not None, so pass through as though explicitly passed
            imd = -1

        elif iscf is None:
            # we return the last MD step and the requested scf iteration
            if as_dataframe:
                return md_charge.groupby(level=1).nth(imd)
            return md_charge[imd]

        if as_dataframe:
            # first select imd
            md_scf_charge = md_scf_charge.groupby(level=0)
            group = list(md_scf_charge.groups.keys())[imd]
            md_scf_charge = md_scf_charge.get_group(group).droplevel(0)
            return md_scf_charge.groupby(level=1).nth(iscf)
        return md_scf_charge[imd][iscf]
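
The `groupby(level=...)`/`nth`/`get_group` calls above are how a single MD or
SCF step is carved out of the MultiIndexed charge frames; a minimal
self-contained illustration with fake data (pandas only, not the sisl API):

import numpy as np
import pandas as pd

# fake charges: 2 MD steps x 3 SCF steps x 2 atoms
idx = pd.MultiIndex.from_product(
    [range(1, 3), range(1, 4), range(2)], names=["imd", "iscf", "atom"])
charges = pd.DataFrame({"dq": np.arange(12, dtype=float)}, index=idx)

# last SCF step of every MD step (cf. groupby(level=[0, 2]).nth(iscf) above)
print(charges.groupby(level=[0, 2]).nth(-1))

# all SCF steps of the last MD step (cf. groupby(level=0) + get_group above)
g = charges.groupby(level=0)
last_md = list(g.groups.keys())[-1]
print(g.get_group(last_md).droplevel(0))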
예제 #23
0
#plt.show()

#SINEWAVE plot example: Prediction vs Actual time-series data
# imports added for completeness; sine_test_x, data_predicted, data_actual
# and n_out are assumed to be defined earlier in the original script
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

for i in np.arange(0, sine_test_x.shape[0], 500):
    series_plot_input = pd.DataFrame({'Input': sine_test_x[i, :, 0]})
    series_plot_predicted = pd.DataFrame({'Predicted': data_predicted[i, :]})
    series_plot_actual = pd.DataFrame({'Actual': data_actual[i, :]})
    nan = pd.DataFrame(np.nan, index=np.arange(0, n_out), columns=['A'])

    concat1 = pd.concat([series_plot_input, nan])
    concat2 = pd.concat([nan, series_plot_predicted])
    concat3 = pd.concat([nan, series_plot_actual])
    series_plot = pd.concat([concat1, concat2, concat3], axis=1)
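    # the NaN frame staggers the three series along one shared index: input
    # occupies the first rows, predicted/actual the following n_out rows;
    # columns.get_loc("A") below returns a boolean mask (duplicate 'A' labels),
    # so ~mask drops every placeholder column at once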
    series_plot = series_plot.loc[:, ~series_plot.columns.get_loc("A")]

    series_plot.index = pd.RangeIndex(len(series_plot.index))

    dpi = 300
    multiplier = 1e1
    ticks = 1e-1
    x_tick = int(math.floor(n_out / 10) * 10 / 10)
    fig, ax = plt.subplots(figsize=(3000 / dpi, 3500 / dpi), dpi=dpi)
    sns.set_context("talk")
    plot = sns.lineplot(data=series_plot,
                        legend='full',
                        alpha=1,
                        palette="muted",
                        dashes=False)
    plot.set(xlabel='Time (Index)', ylabel='Voltage', title='iEEG Values')
    #plot.set(xlim=(-1,n_out*2), xticks=range(0,n_out*2,x_tick), yticks=np.arange(math.floor(plot.axes[0,0].get_ylim()[0] * multiplier) / multiplier,math.ceil(plot.axes[0,0].get_ylim()[1] * multiplier) / multiplier, ticks))
    plot.set_xticklabels(range(0, n_out * 2, x_tick))
예제 #24
0
# imports added for completeness; country_iso_df, date_list, date_slider,
# datetime_list, dataset_type and generate_map come from earlier in the app
import datetime as dt

import pandas as pd
import streamlit as st

df = country_iso_df[country_iso_df['date'] == date_list[
    date_slider]]  #create dataframe from user-selected date
date_formatted = dt.datetime.strftime(datetime_list[date_slider],
                                      '%m/%d/%Y')  #format date
st.write(f"Global COVID-19 {dataset_type} cases as of date: ",
         date_formatted)  #display formatted date
st.plotly_chart(generate_map(df))  #plot on map

#Prints the relevant dataframe in streamlit
st.subheader(
    f"Top 10 countries for {dataset_type} cases as of date: {date_formatted}")
df2 = df[['country', 'count']]  #gets user-defined dataframe
df2 = df2.sort_values(
    by='count', ascending=False
)[:10]  #sorts dataframe by parameter, e.g. positive test and gets top 10
df2.index = pd.RangeIndex(1, 11)
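# (RangeIndex(1, 11) relabels the rows 1..10 so the table reads as a ranking)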
st.table(df2)  #prints table

# '''
# Note: the subsequent code was written based on formatting of archived JHU data sets, which originally contained US
# state data. The archived data can be found at the following link:
# https://github.com/CSSEGISandData/COVID-19/tree/master/archived_data/archived_time_series.
# '''

# #Select state data
# st.subheader('Explore data by US state over time')
# states_url = "https://secure.ssa.gov/apps10/poms.nsf/lnx/0901501010"
# state_dfs = pd.read_html(states_url, header=0)[0]
# state_dfs.to_sql('states', conn, index_label='id', if_exists='replace') #create table of state abbreviations

# #Function to return US data by state (aggregated) or by county for each state
예제 #25
0
def test_merge(setup):
    df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])
    df3 = df1.copy()
    df3.index = pd.RangeIndex(2, 6, name='index')
    df4 = df1.copy()
    df4.index = pd.MultiIndex.from_tuples([(i, i + 1) for i in range(4)], names=['i1', 'i2'])

    mdf1 = from_pandas(df1, chunk_size=2)
    mdf2 = from_pandas(df2, chunk_size=2)
    mdf3 = from_pandas(df3, chunk_size=3)
    mdf4 = from_pandas(df4, chunk_size=2)

    # Note [Index of Merge]
    #
    # When `left_index` and `right_index` of `merge` are both False, pandas generates a RangeIndex
    # for the final result dataframe.
    #
    # We chunk the `left` and `right` dataframes, so every result chunk carries its own RangeIndex.
    # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot
    # obtain the same index values as pandas. But we guarantee that the content of the dataframe is correct.
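    # For instance (a sketch): concatenating two result chunks that each carry
    # RangeIndex(2) yields index [0, 1, 0, 1] rather than RangeIndex(4), which
    # is why the comparisons below sort both frames first.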

    # merge on common columns (the default)
    expected0 = df1.merge(df2)
    jdf0 = mdf1.merge(mdf2)
    result0 = jdf0.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0))

    # merge on left index and `right_on`
    expected1 = df1.merge(df2, how='left', right_on='x', left_index=True)
    jdf1 = mdf1.merge(mdf2, how='left', right_on='x', left_index=True)
    result1 = jdf1.execute().fetch()
    expected1.set_index('a_x', inplace=True)
    result1.set_index('a_x', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected1, 0), sort_dataframe_inplace(result1, 0))

    # merge on `left_on` and right index
    expected2 = df1.merge(df2, how='right', left_on='a', right_index=True)
    jdf2 = mdf1.merge(mdf2, how='right', left_on='a', right_index=True)
    result2 = jdf2.execute().fetch()
    expected2.set_index('a', inplace=True)
    result2.set_index('a', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0))

    # merge on `left_on` and `right_on`
    expected3 = df1.merge(df2, how='left', left_on='a', right_on='x')
    jdf3 = mdf1.merge(mdf2, how='left', left_on='a', right_on='x')
    result3 = jdf3.execute().fetch()
    expected3.set_index('a_x', inplace=True)
    result3.set_index('a_x', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0))

    # merge on `on`
    expected4 = df1.merge(df2, how='right', on='a')
    jdf4 = mdf1.merge(mdf2, how='right', on='a')
    result4 = jdf4.execute().fetch()
    expected4.set_index('a', inplace=True)
    result4.set_index('a', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0))

    # merge on multiple columns
    expected5 = df1.merge(df2, how='inner', on=['a', 'b'])
    jdf5 = mdf1.merge(mdf2, how='inner', on=['a', 'b'])
    result5 = jdf5.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected5, 0), sort_dataframe_inplace(result5, 0))

    # merge when some on is index
    expected6 = df3.merge(df2, how='inner', left_on='index', right_on='a')
    jdf6 = mdf3.merge(mdf2, how='inner', left_on='index', right_on='a')
    result6 = jdf6.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected6, 0), sort_dataframe_inplace(result6, 0))

    # merge when on is in MultiIndex
    expected7 = df4.merge(df2, how='inner', left_on='i1', right_on='a')
    jdf7 = mdf4.merge(mdf2, how='inner', left_on='i1', right_on='a')
    result7 = jdf7.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected7, 0), sort_dataframe_inplace(result7, 0))

    # merge when on is in MultiIndex, and on not in index
    expected8 = df4.merge(df2, how='inner', on=['a', 'b'])
    jdf8 = mdf4.merge(mdf2, how='inner', on=['a', 'b'])
    result8 = jdf8.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected8, 0), sort_dataframe_inplace(result8, 0))
예제 #26
0
    def item_get_elements(self, s, type, name, filters=None):
        if filters:
            # Convert filter elements to strings
            filters = {dim: as_str_list(ele) for dim, ele in filters.items()}

        try:
            # Retrieve the cached value with this exact set of filters
            return self.cache_get(s, type, name, filters)
        except KeyError:
            pass  # Cache miss

        try:
            # Retrieve a cached, unfiltered value of the same item
            unfiltered = self.cache_get(s, type, name, None)
        except KeyError:
            pass  # Cache miss
        else:
            # Success; filter and return
            return filtered(unfiltered, filters)

        # Failed to load item from cache

        # Retrieve the item
        item = self._get_item(s, type, name, load=True)
        idx_names = list(item.getIdxNames())
        idx_sets = list(item.getIdxSets())

        # Get list of elements, using filters if provided
        if filters is not None:
            jFilter = java.HashMap()

            for idx_name, values in filters.items():
                # Retrieve the elements of the index set as a list
                idx_set = idx_sets[idx_names.index(idx_name)]
                elements = self.item_get_elements(s, 'set', idx_set).tolist()

                # Filter for only included values and store
                filtered_elements = filter(lambda e: e in values, elements)
                jFilter.put(idx_name, to_jlist(filtered_elements))

            jList = item.getElements(jFilter)
        else:
            jList = item.getElements()

        if item.getDim() > 0:
            # Mapping set or multi-dimensional equation, parameter, or variable
            columns = copy(idx_names)

            # Prepare dtypes for index columns
            dtypes = {}
            for idx_name, idx_set in zip(columns, idx_sets):
                # NB using categoricals could be more memory-efficient, but
                #    requires adjustment of tests/documentation. See
                #    https://github.com/iiasa/ixmp/issues/228
                # dtypes[idx_name] = CategoricalDtype(
                #     self.item_get_elements(s, 'set', idx_set))
                dtypes[idx_name] = str

            # Prepare dtypes for additional columns
            if type == 'par':
                columns.extend(['value', 'unit'])
                dtypes['value'] = float
                # Same as above
                # dtypes['unit'] = CategoricalDtype(self.jobj.getUnitList())
                dtypes['unit'] = str
            elif type in ('equ', 'var'):
                columns.extend(['lvl', 'mrg'])
                dtypes.update({'lvl': float, 'mrg': float})
            # Prepare empty DataFrame
            result = pd.DataFrame(index=pd.RangeIndex(len(jList)),
                                  columns=columns)

            # Copy vectors from Java into DataFrame columns
            # NB [:] causes JPype to use a faster code path
            for i in range(len(idx_sets)):
                result.iloc[:, i] = item.getCol(i, jList)[:]
            if type == 'par':
                result.loc[:, 'value'] = item.getValues(jList)[:]
                result.loc[:, 'unit'] = item.getUnits(jList)[:]
            elif type in ('equ', 'var'):
                result.loc[:, 'lvl'] = item.getLevels(jList)[:]
                result.loc[:, 'mrg'] = item.getMarginals(jList)[:]

            # .loc assignment above modifies dtypes; set afterwards
            result = result.astype(dtypes)
        elif type == 'set':
            # Index sets
            # dtype=object is to silence a warning in pandas 1.0
            result = pd.Series(item.getCol(0, jList), dtype=object)
        elif type == 'par':
            # Scalar parameters
            result = dict(value=float(item.getScalarValue().floatValue()),
                          unit=str(item.getScalarUnit()))
        elif type in ('equ', 'var'):
            # Scalar equations and variables
            result = dict(lvl=float(item.getScalarLevel().floatValue()),
                          mrg=float(item.getScalarMarginal().floatValue()))

        # Store cache
        self.cache(s, type, name, filters, result)

        return result
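
The lookup order at the top of `item_get_elements` (exact-filter cache, then
unfiltered cache plus local filtering, then a real load) is a reusable pattern;
a minimal sketch under assumptions (plain dict cache, hypothetical `load` and
`apply_filters` callables; the real backend keys its cache differently):

def get_elements(cache, name, filters, load, apply_filters):
    def key(f):
        # hashable cache key; None marks the unfiltered item
        return (name, None if f is None else
                tuple(sorted((k, tuple(v)) for k, v in f.items())))

    try:
        return cache[key(filters)]    # 1) exact-filter hit
    except KeyError:
        pass                          # cache miss
    try:
        full = cache[key(None)]       # 2) unfiltered hit; filter locally
    except KeyError:
        full = load(name)             # 3) full miss: load from the backend
        cache[key(None)] = full
    result = apply_filters(full, filters) if filters else full
    cache[key(filters)] = result      # store for the next exact hit
    return result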
예제 #27
0
        if row_['AwayTeam'] == 'Inter':
            df_odds['AwayTeam'][idx] = 'Inter Milan'
        if row_['AwayTeam'] == 'Milan':
            df_odds['AwayTeam'][idx] = 'AC Milan'
        if row_['AwayTeam'] == 'Siena':
            df_odds['AwayTeam'][idx] = 'Robur Siena S.S.D.'


# Suppress 'SettingWithCopyWarning' & .copy(deep=True)
pd.options.mode.chained_assignment = None
df_teams = pd.read_excel(DATA_FILE)
names_teams = set(list(df_teams['Home Team']))

df_odds = pd.read_csv(ODDS_FILE, error_bad_lines=False)
df_odds_ = df_odds.loc[:5326, :]
df_odds_.index = pd.RangeIndex(len(df_odds_.index))

names_odds = set(list(df_odds_['HomeTeam']))
names = names_odds.difference(names_teams)

edit_odds_df()

pre_df = df_odds.loc[:, :]

bet_odds = []
for index, row in df_teams.iterrows():
    date = row['Date']
    converted_date = convert_date()
    found = False
    for i, x in pre_df.iterrows():
        if converted_date == x['Date'] and row['Home Team'] == x[
예제 #28
0
class TestGrouping:
    def test_grouper_index_types(self):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
        for index in [
            tm.makeFloatIndex,
            tm.makeStringIndex,
            tm.makeUnicodeIndex,
            tm.makeIntIndex,
            tm.makeDateIndex,
            tm.makePeriodIndex,
        ]:

            df.index = index(len(df))
            df.groupby(list("abcde")).apply(lambda x: x)

            df.index = list(reversed(df.index.tolist()))
            df.groupby(list("abcde")).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):

        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta

        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (
            df.reset_index()
            .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")])
            .sum()
        )
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype="int64")

        result = df.groupby(
            [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame(
            {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
        )
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
            index=pd.MultiIndex.from_product(
                [list("ab"), range(2), date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series(
            [28], index=Index([Timestamp("2013-01-31")], freq="M", name="three")
        )
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
        )
        idx.names = ["outer", "inner"]
        df_multi = pd.DataFrame(
            {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
            index=idx,
        )
        result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
        expected = df_single.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
        expected = df_single.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        # GH9344, GH9049
        idx_names = ["x", "y"]
        idx = pd.MultiIndex.from_tuples(
            [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names
        )
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432, adapted for GH25871
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array(
            [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
        )
        cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(
            categories, categories=categories, ordered=True
        )
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):

        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
            index=pd.MultiIndex.from_product(
                [list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
            ),
        )
        result = df.groupby(
            [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]
        ).sum()
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        grouped = df.groupby("A")

        result = df.groupby(grouped.grouper).mean()
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        # GH #679
        from pandas import Series

        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        dates = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.
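        # illustration (a sketch): key[0:6] raises TypeError on a Timestamp,
        # so index.map falls back to slicing the whole index; index[0:6] has
        # length 6 against 12 data points, hence the assertion below.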

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):

        # GH 7997
        # regression from 0.14.1
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        tm.assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        # GH 17979
        df = pd.DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df2 = pd.DataFrame(
            df.values,
            columns=pd.MultiIndex.from_arrays(
                [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
            ),
        )
        expected = df2.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()

        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1

        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self):
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
            "exp"
        )
        df.groupby(level="exp")
        msg = "level name foo is not the name of the index"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo")

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize(
        "sort,labels",
        [
            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
        ],
    )
    def test_level_preserve_order(self, sort, labels, mframe):
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_grouping_labels(self, mframe):
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_list_grouper_with_nat(self):
        # GH 14715
        df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key="date", freq="AS")

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {pd.Timestamp("2011-01-01"): 365}
        tm.assert_dict_equal(result.groups, expected)

    @pytest.mark.parametrize(
        "func,expected",
        [
            ("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))),
            ("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))),
            ("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))),
        ],
    )
    def test_evaluate_with_empty_groups(self, func, expected):
        # 26208
        # test transform'ing empty groups
        # (not testing other agg fns, because they return
        # different index objects)
        df = pd.DataFrame({1: [], 2: []})
        g = df.groupby(1)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        # https://github.com/pandas-dev/pandas/issues/27190
        s = pd.Series([], name="name")
        gr = s.groupby([])

        result = gr.mean()
        tm.assert_series_equal(result, s)

        # check group properties
        assert len(gr.grouper.groupings) == 1
        tm.assert_numpy_array_equal(
            gr.grouper.group_info[0], np.array([], dtype=np.dtype("int64"))
        )

        tm.assert_numpy_array_equal(
            gr.grouper.group_info[1], np.array([], dtype=np.dtype("int"))
        )

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        # issue 20519
        df = DataFrame(
            [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
        ).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)
예제 #29
0
    def robustness_index(self, data, model, targetcol=None, labels=[], nsamples=10, nrecords=100, clevels=[95], random_state=None):
        """
        Create `nsamples` bootstrap samples of size `nrecords` (drawn with replacement),
        score `model` on each, and aggregate the per-sample statistics.
        :param nsamples: number of bootstrap samples to draw
        :param nrecords: number of records per bootstrap sample
        :return: dataframe of aggregated statistics (mean, se and confidence intervals)
        """
        is_h2o_model = False
        cols = data.columns
        # cbind features and target
        nlen = len(labels)
        if targetcol is None and nlen == 0:
            assert False, "Either targetcol or labels must be specified"
        if nlen > 0:
            if nlen != data.shape[0]:
                assert False, "Number of observations and number of labels must match"
            else:
                if isinstance(cols, pd.RangeIndex):
                    targetcol = pd.Index([len(cols)])
                    cols = pd.RangeIndex(start=0, stop=len(cols) + 1, step=1)
                else:
                    targetcol = 'target'
                    # Index.append returns a new Index, so re-assign the result
                    cols = cols.append(pd.Index([targetcol]))

                if is_h2o_frame(data):
                    data = h2o.cbind(data, labels)
                else:
                    data = pd.concat([data.reset_index(drop=True), labels], axis=1)

        # check the type of data
        if is_h2o_frame(data):  # if h2o then convert it
            np_data = h2o.as_list(data).values
            col_types = [v for v in data.types.values()]
            is_h2o_model = True
        else:
            np_data = data.values

        for i in range(1, nsamples + 1):
            logging.info("Sampling " + str(i))
            data_boot = resample(np_data, replace=True, n_samples=nrecords, random_state=random_state)
            data_boot_df = pd.DataFrame(data=data_boot, columns=cols)
            if is_h2o_model:
                data_boot_df = h2o.H2OFrame(data_boot_df, column_types=col_types)
                y_act = h2o.as_list(data_boot_df[targetcol]).values
            else:
                y_act = data_boot_df[len(cols) - 1]

            # remove the target column for predicting as model
            if nlen > 0:
                data_boot_df = data_boot_df.drop([len(cols) - 1], axis=1)

            preds = model.predict(data_boot_df)
            if preds.ndim == 1:
                y_preds = preds
            else:
                y_preds = preds[:, 0]

            # make necessary transformation for h20 frames
            if is_h2o_frame(y_preds):
                y_preds = convert_h2o_list(y_preds)

            if is_h2o_frame(y_act):
                y_act = convert_h2o_list(y_act)

            self.prepare_stats_sample(y_act, y_preds)

        # We have estimations from multiple sample. Now get the mean, se, and CI
        self.prepare_robustness_index(clevels)
        return self.stats_df
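
A hedged usage sketch (the host class name is hypothetical; `X` holds the
features and `y` the matching labels, with a fitted sklearn-style `model`):

# evaluator = ModelRobustness()  # hypothetical class exposing robustness_index
# stats = evaluator.robustness_index(X, model, labels=y,
#                                    nsamples=10, nrecords=100,
#                                    clevels=[90, 95], random_state=42)
# print(stats)  # aggregated mean / se / confidence intervals per statistic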
예제 #30
0
    def testStringMethod(self):
        s = pd.Series(['a', 'b', 'c'], name='s')
        series = from_pandas_series(s, chunk_size=2)

        with self.assertRaises(AttributeError):
            _ = series.str.non_exist

        r = series.str.contains('c')
        self.assertEqual(r.dtype, np.bool_)
        self.assertEqual(r.name, s.name)
        pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index)
        self.assertEqual(r.shape, s.shape)

        r = r.tiles()
        for i, c in enumerate(r.chunks):
            self.assertEqual(c.index, (i,))
            self.assertEqual(c.dtype, np.bool_)
            self.assertEqual(c.name, s.name)
            pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                          s.index[i * 2: (i + 1) * 2])
            self.assertEqual(c.shape, (2,) if i == 0 else (1,))

        r = series.str.split(',', expand=True, n=1)
        self.assertEqual(r.op.output_types[0], OutputType.dataframe)
        self.assertEqual(r.shape, (3, 2))
        pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index)
        pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(2))

        r = r.tiles()
        for i, c in enumerate(r.chunks):
            self.assertEqual(c.index, (i, 0))
            pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                          s.index[i * 2: (i + 1) * 2])
            pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(2))
            self.assertEqual(c.shape, (2, 2) if i == 0 else (1, 2))

        with self.assertRaises(TypeError):
            _ = series.str.cat([['1', '2']])

        with self.assertRaises(ValueError):
            _ = series.str.cat(['1', '2'])

        with self.assertRaises(ValueError):
            _ = series.str.cat(',')

        with self.assertRaises(TypeError):
            _ = series.str.cat({'1', '2', '3'})

        r = series.str.cat(sep=',')
        self.assertEqual(r.op.output_types[0], OutputType.scalar)
        self.assertEqual(r.dtype, s.dtype)

        r = r.tiles()
        self.assertEqual(len(r.chunks), 1)
        self.assertEqual(r.chunks[0].op.output_types[0], OutputType.scalar)
        self.assertEqual(r.chunks[0].dtype, s.dtype)

        r = series.str.extract(r'[ab](\d)', expand=False)
        self.assertEqual(r.op.output_types[0], OutputType.series)
        self.assertEqual(r.dtype, s.dtype)

        r = r.tiles()
        for i, c in enumerate(r.chunks):
            self.assertEqual(c.index, (i,))
            self.assertEqual(c.dtype, s.dtype)
            self.assertEqual(c.name, s.name)
            pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                          s.index[i * 2: (i + 1) * 2])
            self.assertEqual(c.shape, (2,) if i == 0 else (1,))

        r = series.str.extract(r'[ab](\d)', expand=True)
        self.assertEqual(r.op.output_types[0], OutputType.dataframe)
        self.assertEqual(r.shape, (3, 1))
        pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index)
        pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(1))

        r = r.tiles()
        for i, c in enumerate(r.chunks):
            self.assertEqual(c.index, (i, 0))
            pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                          s.index[i * 2: (i + 1) * 2])
            pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(1))
            self.assertEqual(c.shape, (2, 1) if i == 0 else (1, 1))

        self.assertIn('lstrip', dir(series.str))