Пример #1
0
    def testIndexReduction(self):
        """Compare ``min``/``max``/``all``/``any`` on ``md.Index`` with pandas.

        Each reduction is run on two random integer indexes, both without
        and with an explicit ``chunk_size``.
        """
        rng = np.random.RandomState(0)
        # Values drawn from [0, 5): zeros may appear.
        low_values = pd.Index(rng.randint(0, 5, (100, )))
        # Values drawn from [1, 6): zeros never appear.
        high_values = pd.Index(rng.randint(1, 6, (100, )))

        for method in ['min', 'max', 'all', 'any']:
            for raw in (low_values, high_values):
                # First with default chunking, then with chunks of 10.
                for index_kwargs in ({}, {'chunk_size': 10}):
                    idx = md.Index(raw, **index_kwargs)
                    reduced = getattr(idx, method)()
                    result = self.executor.execute_dataframe(
                        reduced, concat=True)[0]
                    self.assertEqual(result, getattr(raw, method)())
Пример #2
0
    def testIndexInitializer(self):
        """Build ``md.Index`` from several input kinds and compare with pandas.

        Inputs covered: a Mars tensor, another Mars index, a pandas Series
        and a pandas Index.
        """
        def _shuffled():
            # The integers 0..99 in random order.
            values = np.arange(100)
            np.random.shuffle(values)
            return values

        def _merge_chunks(chunks):
            # Re-assemble per-chunk results into a single pandas Index.
            return pd.Index(pd.concat([pd.Series(chunk) for chunk in chunks]))

        # from tensor
        values = _shuffled()
        mars_idx = md.Index(mt.tensor(values), chunk_size=7)
        fetched = self.executor.execute_dataframe(mars_idx, concat=True)[0]
        pd.testing.assert_index_equal(fetched, pd.Index(values))

        # from Mars index
        values = _shuffled()
        source_idx = md.Index(values, chunk_size=7)
        mars_idx = md.Index(source_idx, num_partitions=11)
        chunks = self.executor.execute_dataframe(mars_idx)
        # NOTE(review): 11 partitions are requested but 10 chunks are
        # asserted -- presumably the per-chunk size is rounded up; confirm.
        self.assertEqual(len(chunks), 10)
        pd.testing.assert_index_equal(_merge_chunks(chunks), pd.Index(values))

        # from pandas initializer
        values = _shuffled()
        pd_series = pd.Series(values, name='series_name')
        mars_idx = md.Index(pd_series, chunk_size=7)
        fetched = self.executor.execute_dataframe(mars_idx, concat=True)[0]
        pd.testing.assert_index_equal(fetched, pd.Index(pd_series))

        # Same shuffled values, this time wrapped in a named pandas Index.
        pd_index = pd.Index(values, name='idx_name')
        mars_idx = md.Index(pd_index, num_partitions=10)
        chunks = self.executor.execute_dataframe(mars_idx)
        self.assertEqual(len(chunks), 10)
        pd.testing.assert_index_equal(_merge_chunks(chunks),
                                      pd.Index(pd_index))
Пример #3
0
def test_index_initializer(setup):
    """Build ``md.Index`` from several input kinds and compare with pandas.

    Inputs covered: a Mars tensor, another Mars index, a pandas Series and
    a pandas Index.
    """
    def _shuffled():
        # The integers 0..99 in random order.
        values = np.arange(100)
        np.random.shuffle(values)
        return values

    # from tensor
    values = _shuffled()
    fetched = md.Index(mt.tensor(values), chunk_size=7).execute().fetch()
    pd.testing.assert_index_equal(fetched, pd.Index(values))

    # from Mars index
    values = _shuffled()
    source_idx = md.Index(values, chunk_size=7)
    fetched = md.Index(source_idx, num_partitions=11).execute().fetch()
    pd.testing.assert_index_equal(fetched, pd.Index(values))

    # from pandas initializer
    values = _shuffled()
    pd_series = pd.Series(values, name='series_name')
    fetched = md.Index(pd_series, chunk_size=7).execute().fetch()
    pd.testing.assert_index_equal(fetched, pd.Index(pd_series))

    # Same shuffled values, this time wrapped in a named pandas Index.
    pd_index = pd.Index(values, name='idx_name')
    fetched = md.Index(pd_index, num_partitions=10).execute().fetch()
    pd.testing.assert_index_equal(fetched, pd.Index(pd_index))
Пример #4
0
    def testInitializerExecution(self):
        """Execute DataFrame/Series/Index initializers and compare with pandas.

        Covers two-level indexes, indexes given as ``md.date_range`` and an
        ``IntervalIndex`` wrapped twice in ``md.Index``.
        """
        def _run(mars_obj):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(mars_obj, concat=True)[0]

        def _two_level_index():
            # Ascending 0..19 paired with descending 20..1.
            return [np.arange(20), np.arange(20, 0, -1)]

        frame_values = np.random.rand(20, 30)

        # DataFrame with a two-level index, chunked unevenly.
        pdf = pd.DataFrame(frame_values, index=_two_level_index())
        mdf = md.DataFrame(pdf, chunk_size=(15, 10))
        pd.testing.assert_frame_equal(pdf, _run(mdf))

        # DataFrame whose index is a Mars date range.
        mdf = md.DataFrame(frame_values,
                           index=md.date_range('2020-1-1', periods=20))
        expected_frame = pd.DataFrame(
            frame_values, index=pd.date_range('2020-1-1', periods=20))
        pd.testing.assert_frame_equal(_run(mdf), expected_frame)

        series_values = np.random.rand(20)

        # Named Series with a two-level index.
        pser = pd.Series(series_values, index=_two_level_index(), name='a')
        mser = md.Series(pser, chunk_size=7)
        pd.testing.assert_series_equal(pser, _run(mser))

        # Series whose index is a Mars date range.
        mser = md.Series(series_values,
                         index=md.date_range('2020-1-1', periods=20))
        expected_series = pd.Series(
            series_values, index=pd.date_range('2020-1-1', periods=20))
        pd.testing.assert_series_equal(_run(mser), expected_series)

        # IntervalIndex passed through md.Index twice.
        intervals = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
        nested_index = md.Index(md.Index(intervals))
        pd.testing.assert_index_equal(intervals, _run(nested_index))
Пример #5
0
def test_index_gpu_initializer(setup_gpu):
    """Build ``md.Index`` from cudf / cupy data; compare after ``to_pandas``."""
    # from raw cudf initializer
    gpu_index = cudf.Index(cupy.random.rand(100), name='a')
    fetched = md.Index(gpu_index, chunk_size=13).execute().fetch()
    pd.testing.assert_index_equal(fetched.to_pandas(), gpu_index.to_pandas())

    # from a raw cupy array, with the name supplied to md.Index itself
    gpu_array = cupy.random.rand(100)
    fetched = md.Index(gpu_array, name='a', chunk_size=13).execute().fetch()
    expected = cudf.Index(gpu_array, name='a')
    pd.testing.assert_index_equal(fetched.to_pandas(), expected.to_pandas())
Пример #6
0
def test_check_na_execution(setup):
    """Compare ``isna``/``notna`` on Mars DataFrame, Series, Index with pandas.

    Each input starts out all-NaN and then gets random integers written to
    randomly chosen positions.
    """
    na_checks = ('isna', 'notna')

    # DataFrame: 20 x 10, all NaN, then 20 random cell writes.
    df_raw = pd.DataFrame(np.nan,
                          index=range(0, 20),
                          columns=list('ABCDEFGHIJ'))
    for _ in range(20):
        value = random.randint(0, 99)
        row, col = random.randint(0, 19), random.randint(0, 9)
        df_raw.iloc[row, col] = value

    df = md.DataFrame(df_raw, chunk_size=4)
    for check in na_checks:
        pd.testing.assert_frame_equal(getattr(df, check)().execute().fetch(),
                                      getattr(df_raw, check)())

    # Series: 20 NaNs, then 3 random writes.
    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(3):
        value = random.randint(0, 99)
        series_raw.iloc[random.randint(0, 19)] = value

    series = md.Series(series_raw, chunk_size=4)
    for check in na_checks:
        pd.testing.assert_series_equal(
            getattr(series, check)().execute().fetch(),
            getattr(series_raw, check)())

    # Index: 20 NaNs, then 3 random writes.
    idx_values = np.full(20, np.nan)
    for _ in range(3):
        value = random.randint(0, 99)
        idx_values[random.randint(0, 19)] = value
    idx_raw = pd.Index(idx_values)

    idx = md.Index(idx_raw, chunk_size=4)
    for check in na_checks:
        # isna/notna on an index are compared as plain arrays.
        np.testing.assert_array_equal(getattr(idx, check)().execute().fetch(),
                                      getattr(idx_raw, check)())
Пример #7
0
    def testSeriesInitializer(self):
        """Build ``md.Series`` from several input kinds and compare with pandas.

        Inputs covered: a Mars tensor, a Mars index, another Mars series and
        a raw pandas Series.
        """
        def _run(mars_obj):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(mars_obj, concat=True)[0]

        # from tensor
        values = np.random.rand(100)
        tensor = mt.tensor(values, chunk_size=7)
        pd.testing.assert_series_equal(_run(md.Series(tensor)),
                                       pd.Series(values))

        # Same tensor, but with a chunk_size given to md.Series as well.
        pd.testing.assert_series_equal(_run(md.Series(tensor, chunk_size=13)),
                                       pd.Series(values))

        # from index
        shuffled = np.arange(100)
        np.random.shuffle(shuffled)
        pd_index = pd.Index(shuffled, name='idx_name')
        mars_index = md.Index(pd_index, chunk_size=7)
        pd.testing.assert_series_equal(_run(md.Series(mars_index)),
                                       pd.Series(pd_index))

        # from Mars series
        pd_series = pd.Series(np.random.rand(100), name='series_name')
        doubled = md.Series(pd_series, chunk_size=15) * 2
        repartitioned = md.Series(doubled, num_partitions=11)
        chunks = self.executor.execute_dataframe(repartitioned)
        # NOTE(review): 11 partitions are requested but 10 chunks are
        # asserted -- presumably the per-chunk size is rounded up; confirm.
        self.assertEqual(len(chunks), 10)
        pd.testing.assert_series_equal(pd.concat(chunks), pd_series * 2)

        # from raw pandas initializer
        pd_series = pd.Series(np.random.rand(100), name='series_name')
        partitioned = md.Series(pd_series, num_partitions=10)
        chunks = self.executor.execute_dataframe(partitioned)
        self.assertEqual(len(chunks), 10)
        pd.testing.assert_series_equal(pd.concat(chunks), pd_series)
Пример #8
0
def test_index_only(setup):
    """Create DataFrame/Series from an index alone and compare with pandas."""
    # Index given as a plain list.
    frame = md.DataFrame(index=[1, 2, 3])
    expected_frame = pd.DataFrame(index=[1, 2, 3])
    pd.testing.assert_frame_equal(frame.execute().fetch(), expected_frame)

    series = md.Series(index=[1, 2, 3])
    expected_series = pd.Series(index=[1, 2, 3])
    pd.testing.assert_series_equal(series.execute().fetch(), expected_series)

    # Index given as a Mars index.
    frame = md.DataFrame(index=md.Index([1, 2, 3]))
    expected_frame = pd.DataFrame(index=[1, 2, 3])
    pd.testing.assert_frame_equal(frame.execute().fetch(), expected_frame)

    series = md.Series(index=md.Index([1, 2, 3]), dtype=object)
    expected_series = pd.Series(index=[1, 2, 3], dtype=object)
    pd.testing.assert_series_equal(series.execute().fetch(), expected_series)
Пример #9
0
def test_drop_na_execution(setup):
    """Compare ``dropna`` on Mars DataFrame, Series and Index with pandas."""
    # dataframe cases
    df_raw = pd.DataFrame(np.nan, index=range(0, 20),
                          columns=list('ABCDEFGHIJ'))
    # Scatter 30 random values over the all-NaN frame.
    for _ in range(30):
        value = random.randint(0, 99)
        row, col = random.randint(0, 19), random.randint(0, 9)
        df_raw.iloc[row, col] = value
    # Completely fill between one and five randomly chosen rows.
    for _ in range(random.randint(1, 5)):
        row = random.randint(0, 19)
        for col in range(10):
            df_raw.iloc[row, col] = random.randint(0, 99)

    # only one chunk in columns, can run dropna directly
    dropped = md.DataFrame(df_raw, chunk_size=(4, 10)).dropna()
    pd.testing.assert_frame_equal(dropped.execute().fetch(), df_raw.dropna())

    # multiple chunks in columns, count() will be called first
    dropna_variants = (
        {},
        {'how': 'all'},
        {'subset': list('ABFI')},
        {'how': 'all', 'subset': list('BDHJ')},
    )
    for dropna_kwargs in dropna_variants:
        dropped = md.DataFrame(df_raw, chunk_size=4).dropna(**dropna_kwargs)
        pd.testing.assert_frame_equal(dropped.execute().fetch(),
                                      df_raw.dropna(**dropna_kwargs))

    # In-place variant on the Mars DataFrame.
    mdf = md.DataFrame(df_raw, chunk_size=4)
    mdf.dropna(how='all', inplace=True)
    pd.testing.assert_frame_equal(mdf.execute().fetch(),
                                  df_raw.dropna(how='all'))

    # series cases
    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(10):
        value = random.randint(0, 99)
        series_raw.iloc[random.randint(0, 19)] = value

    dropped = md.Series(series_raw, chunk_size=4).dropna()
    pd.testing.assert_series_equal(dropped.execute().fetch(),
                                   series_raw.dropna())

    mseries = md.Series(series_raw, chunk_size=4)
    mseries.dropna(inplace=True)
    pd.testing.assert_series_equal(mseries.execute().fetch(),
                                   series_raw.dropna())

    # index cases
    idx_values = np.full(20, np.nan)
    for _ in range(10):
        value = random.randint(0, 99)
        idx_values[random.randint(0, 19)] = value
    idx_raw = pd.Index(idx_values)

    dropped = md.Index(idx_raw, chunk_size=4).dropna()
    pd.testing.assert_index_equal(dropped.execute().fetch(), idx_raw.dropna())
Пример #10
0
    def testFromDataFrameExecution(self):
        """Convert Mars DataFrame, Series and Index objects into tensors."""
        run_tensor = self.executor.execute_tensor

        shapes = md.DataFrame(
            {'angle': [0, 3, 4], 'degree': [360, 180, 360]},
            index=['circle', 'triangle', 'rectangle'])
        actual = run_tensor(from_dataframe(shapes))
        expected = run_tensor(mt.tensor([[0, 360], [3, 180], [4, 360]]))
        np.testing.assert_equal(actual, expected)

        # test up-casting
        mixed = md.DataFrame({'a': [0.1, 0.2, 0.3], 'b': [1, 2, 3]})
        actual_mixed = run_tensor(from_dataframe(mixed))
        np.testing.assert_equal(actual_mixed[0].dtype, np.dtype('float64'))
        expected_mixed = run_tensor(
            mt.tensor([[0.1, 1.0], [0.2, 2.0], [0.3, 3.0]]))
        np.testing.assert_equal(actual_mixed, expected_mixed)

        # Chunked DataFrame: the result is asserted to be Fortran-ordered.
        rows = [[0.1, 0.2, 0.4], [0.4, 0.7, 0.3]]
        chunked = md.DataFrame(rows, columns=list('abc'), chunk_size=2)
        actual_chunked = run_tensor(from_dataframe(chunked), concat=True)[0]
        np.testing.assert_array_equal(actual_chunked, np.asarray(rows))
        self.assertTrue(actual_chunked.flags['F_CONTIGUOUS'])
        self.assertFalse(actual_chunked.flags['C_CONTIGUOUS'])

        # test from series
        executed = md.Series([1, 2, 3]).to_tensor().execute()
        np.testing.assert_array_equal(executed, np.array([1, 2, 3]))

        executed = md.Series(range(10), chunk_size=3).to_tensor().execute()
        np.testing.assert_array_equal(executed, np.arange(10))

        # test from index
        pairs = [(0, 1), (2, 3), (4, 5)]

        multi = md.Index(pd.MultiIndex.from_tuples(pairs))
        executed = multi.to_tensor(extract_multi_index=True).execute()
        np.testing.assert_array_equal(executed, np.arange(6).reshape((3, 2)))

        multi = md.Index(pd.MultiIndex.from_tuples(pairs))
        executed = multi.to_tensor(extract_multi_index=False).execute()
        np.testing.assert_array_equal(
            executed, pd.MultiIndex.from_tuples(pairs).to_series())
Пример #11
0
def test_index_fill_na_execution(setup):
    """Compare ``md.Index.fillna(1)`` with pandas, unchunked and chunked."""
    # Start all-NaN and write 10 random values at random positions.
    idx_values = np.full(20, np.nan)
    for _ in range(10):
        value = random.randint(0, 99)
        idx_values[random.randint(0, 19)] = value
    idx_raw = pd.Index(idx_values)

    # First the single-chunk case, then chunked with a numeric fill.
    for index_kwargs in ({}, {'chunk_size': 3}):
        filled = md.Index(idx_raw, **index_kwargs).fillna(1)
        pd.testing.assert_index_equal(filled.execute().fetch(),
                                      idx_raw.fillna(1))
Пример #12
0
    def testRepr(self):
        """Check that executed Mars objects have the same repr as their
        numpy / pandas counterparts."""
        # test tensor repr
        with np.printoptions(threshold=100):
            ndarr = np.random.randint(1000, size=(11, 4, 13))
            tensor = mt.tensor(ndarr, chunk_size=3)
            self.assertEqual(repr(tensor.execute()), repr(ndarr))

        for size in (5, 58, 60, 62, 64):
            half = size // 2
            pdf = pd.DataFrame(np.random.randint(1000, size=(size, 10)))

            # test DataFrame repr
            mdf = md.DataFrame(pdf, chunk_size=half)
            actual, wanted = repr(mdf.execute()), repr(pdf)
            self.assertEqual(actual, wanted,
                             f'failed repr for DataFrame when size = {size}')

            # test DataFrame _repr_html_
            actual = mdf.execute()._repr_html_()
            wanted = pdf._repr_html_()
            self.assertEqual(
                actual, wanted,
                f'failed repr html for DataFrame when size = {size}')

            # test Series repr
            first_col = pdf[0]
            mser = md.Series(first_col, chunk_size=half)
            actual, wanted = repr(mser.execute()), repr(first_col)
            self.assertEqual(actual, wanted,
                             f'failed repr for Series when size = {size}')

        # test Index repr
        dates = pd.date_range('2020-1-1', periods=10)
        mars_dates = md.Index(dates, chunk_size=5)
        self.assertIn('DatetimeIndex', repr(mars_dates.execute()))

        # test groupby repr
        abc_frame = pd.DataFrame(np.random.rand(100, 3), columns=list('abc'))
        grouped = md.DataFrame(abc_frame).groupby(['a', 'b']).execute()
        self.assertIn('DataFrameGroupBy', repr(grouped))

        # test Categorical repr
        binned = md.qcut(range(5), 3)
        for rendered in (repr(binned), str(binned)):
            self.assertIn('Categorical', rendered)
        self.assertEqual(repr(binned.execute()), repr(pd.qcut(range(5), 3)))
Пример #13
0
    def testIndexOnly(self):
        """Create DataFrame/Series from an index alone; compare with pandas."""
        def _run(mars_obj):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(mars_obj, concat=True)[0]

        # Index given as a plain list.
        frame = md.DataFrame(index=[1, 2, 3])
        pd.testing.assert_frame_equal(_run(frame),
                                      pd.DataFrame(index=[1, 2, 3]))

        series = md.Series(index=[1, 2, 3])
        pd.testing.assert_series_equal(_run(series),
                                       pd.Series(index=[1, 2, 3]))

        # Index given as a Mars index.
        frame = md.DataFrame(index=md.Index([1, 2, 3]))
        pd.testing.assert_frame_equal(_run(frame),
                                      pd.DataFrame(index=[1, 2, 3]))

        series = md.Series(index=md.Index([1, 2, 3]), dtype=object)
        pd.testing.assert_series_equal(
            _run(series), pd.Series(index=[1, 2, 3], dtype=object))
Пример #14
0
    def testIndexFillNAExecution(self):
        """Compare ``md.Index.fillna(1)`` with pandas, unchunked and chunked."""
        # Start all-NaN and write 10 random values at random positions.
        idx_values = np.full(20, np.nan)
        for _ in range(10):
            value = random.randint(0, 99)
            idx_values[random.randint(0, 19)] = value
        idx_raw = pd.Index(idx_values)

        # First the single-chunk case, then chunked with a numeric fill.
        for index_kwargs in ({}, {'chunk_size': 3}):
            filled = md.Index(idx_raw, **index_kwargs).fillna(1)
            fetched = self.executor.execute_dataframe(filled, concat=True)[0]
            pd.testing.assert_index_equal(fetched, idx_raw.fillna(1))
Пример #15
0
def test_index_reduction(setup, check_ref_counts):
    """Compare ``min``/``max``/``all``/``any`` on ``md.Index`` with pandas.

    Each reduction is run on two random integer indexes, both without and
    with an explicit ``chunk_size``.
    """
    rng = np.random.RandomState(0)
    # Values drawn from [0, 5): zeros may appear.
    low_values = pd.Index(rng.randint(0, 5, (100, )))
    # Values drawn from [1, 6): zeros never appear.
    high_values = pd.Index(rng.randint(1, 6, (100, )))

    for method in ['min', 'max', 'all', 'any']:
        for raw in (low_values, high_values):
            # First with default chunking, then with chunks of 10.
            for index_kwargs in ({}, {'chunk_size': 10}):
                idx = md.Index(raw, **index_kwargs)
                result = getattr(idx, method)().execute().fetch()
                assert result == getattr(raw, method)()
Пример #16
0
def test_repr(setup):
    """Check that executed Mars objects have the same repr as their
    numpy / pandas counterparts."""
    # test tensor repr
    with np.printoptions(threshold=100):
        ndarr = np.random.randint(1000, size=(11, 4, 13))
        tensor = mt.tensor(ndarr, chunk_size=3)
        actual, wanted = repr(tensor.execute()), repr(ndarr)
        assert actual == wanted

    for size in (5, 58, 60, 62, 64):
        half = size // 2
        pdf = pd.DataFrame(np.random.randint(1000, size=(size, 10)))

        # test DataFrame repr
        mdf = md.DataFrame(pdf, chunk_size=half)
        actual, wanted = repr(mdf.execute()), repr(pdf)
        assert actual == wanted

        # test DataFrame _repr_html_
        actual = mdf.execute()._repr_html_()
        wanted = pdf._repr_html_()
        assert actual == wanted

        # test Series repr
        first_col = pdf[0]
        mser = md.Series(first_col, chunk_size=half)
        actual, wanted = repr(mser.execute()), repr(first_col)
        assert actual == wanted

    # test Index repr
    dates = pd.date_range('2020-1-1', periods=10)
    mars_dates = md.Index(dates, chunk_size=5)
    assert 'DatetimeIndex' in repr(mars_dates.execute())

    # test groupby repr
    abc_frame = pd.DataFrame(np.random.rand(100, 3), columns=list('abc'))
    grouped = md.DataFrame(abc_frame).groupby(['a', 'b']).execute()
    assert 'DataFrameGroupBy' in repr(grouped)

    # test Categorical repr
    binned = md.qcut(range(5), 3)
    assert 'Categorical' in repr(binned)
    assert 'Categorical' in str(binned)
    assert repr(binned.execute()) == repr(pd.qcut(range(5), 3))
Пример #17
0
def test_initializer_execution(setup):
    """Execute DataFrame/Series/Index initializers and compare with pandas.

    Covers two-level indexes, indexes given as ``md.date_range`` and an
    ``IntervalIndex`` wrapped twice in ``md.Index``.
    """
    def _two_level_index():
        # Ascending 0..19 paired with descending 20..1.
        return [np.arange(20), np.arange(20, 0, -1)]

    frame_values = np.random.rand(20, 30)

    # DataFrame with a two-level index, chunked unevenly.
    pdf = pd.DataFrame(frame_values, index=_two_level_index())
    fetched = md.DataFrame(pdf, chunk_size=(15, 10)).execute().fetch()
    pd.testing.assert_frame_equal(pdf, fetched)

    # DataFrame whose index is a Mars date range.
    mdf = md.DataFrame(frame_values,
                       index=md.date_range('2020-1-1', periods=20))
    expected_frame = pd.DataFrame(
        frame_values, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_frame_equal(mdf.execute().fetch(), expected_frame)

    # Dict data containing a NaN, indexed by a daily Mars date range.
    mdf = md.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},
                       index=md.date_range('1/1/2010', periods=6, freq='D'))
    expected_frame = pd.DataFrame(
        {"prices": [100, 101, np.nan, 100, 89, 88]},
        index=pd.date_range('1/1/2010', periods=6, freq='D'))
    pd.testing.assert_frame_equal(mdf.execute().fetch(), expected_frame)

    series_values = np.random.rand(20)

    # Named Series with a two-level index.
    pser = pd.Series(series_values, index=_two_level_index(), name='a')
    fetched = md.Series(pser, chunk_size=7).execute().fetch()
    pd.testing.assert_series_equal(pser, fetched)

    # Series whose index is a Mars date range.
    mser = md.Series(series_values,
                     index=md.date_range('2020-1-1', periods=20))
    expected_series = pd.Series(
        series_values, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_series_equal(mser.execute().fetch(), expected_series)

    # IntervalIndex passed through md.Index twice.
    intervals = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    fetched = md.Index(md.Index(intervals)).execute().fetch()
    pd.testing.assert_index_equal(intervals, fetched)
Пример #18
0
    def testRepr(self):
        """Check that executed Mars objects have the same repr as their
        numpy / pandas counterparts."""
        # test tensor repr
        with np.printoptions(threshold=100):
            ndarr = np.random.randint(1000, size=(11, 4, 13))
            tensor = mt.tensor(ndarr, chunk_size=3)
            self.assertEqual(repr(tensor.execute()), repr(ndarr))

        for size in (5, 58, 60, 62, 64):
            half = size // 2
            pdf = pd.DataFrame(np.random.randint(1000, size=(size, 10)))

            # test DataFrame repr
            mdf = md.DataFrame(pdf, chunk_size=half)
            actual, wanted = repr(mdf.execute()), repr(pdf)
            self.assertEqual(
                actual, wanted,
                'failed repr for DataFrame when size = {}'.format(size))

            # test DataFrame _repr_html_
            actual = mdf.execute()._repr_html_()
            wanted = pdf._repr_html_()
            self.assertEqual(
                actual, wanted,
                'failed repr html for DataFrame when size = {}'.format(size))

            # test Series repr
            first_col = pdf[0]
            mser = md.Series(first_col, chunk_size=half)
            actual, wanted = repr(mser.execute()), repr(first_col)
            self.assertEqual(
                actual, wanted,
                'failed repr for Series when size = {}'.format(size))

        # test Index repr
        dates = pd.date_range('2020-1-1', periods=10)
        mars_dates = md.Index(dates, chunk_size=5)
        self.assertIn('DatetimeIndex', repr(mars_dates.execute()))
Пример #19
0
    def testCheckNAExecution(self):
        """Compare ``isna``/``notna`` on Mars DataFrame, Series and Index
        with pandas.

        Each input starts out all-NaN and then gets random integers written
        to randomly chosen positions.
        """
        def _run(mars_obj):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(mars_obj, concat=True)[0]

        na_checks = ('isna', 'notna')

        # DataFrame: 20 x 10, all NaN, then 20 random cell writes.
        df_raw = pd.DataFrame(np.nan,
                              index=range(0, 20),
                              columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            value = random.randint(0, 99)
            row, col = random.randint(0, 19), random.randint(0, 9)
            df_raw.iloc[row, col] = value

        df = md.DataFrame(df_raw, chunk_size=4)
        for check in na_checks:
            pd.testing.assert_frame_equal(_run(getattr(df, check)()),
                                          getattr(df_raw, check)())

        # Series: 20 NaNs, then 3 random writes.
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            value = random.randint(0, 99)
            series_raw.iloc[random.randint(0, 19)] = value

        series = md.Series(series_raw, chunk_size=4)
        for check in na_checks:
            pd.testing.assert_series_equal(_run(getattr(series, check)()),
                                           getattr(series_raw, check)())

        # Index: 20 NaNs, then 3 random writes.
        idx_values = np.full(20, np.nan)
        for _ in range(3):
            value = random.randint(0, 99)
            idx_values[random.randint(0, 19)] = value
        idx_raw = pd.Index(idx_values)

        idx = md.Index(idx_raw, chunk_size=4)
        for check in na_checks:
            # isna/notna on an index are compared as plain arrays.
            np.testing.assert_array_equal(_run(getattr(idx, check)()),
                                          getattr(idx_raw, check)())
Пример #20
0
def test_series_initializer(setup):
    """Build ``md.Series`` from several input kinds and compare with pandas.

    Inputs covered: a Mars tensor, a Mars index, another Mars series and a
    raw pandas Series.
    """
    # from tensor
    values = np.random.rand(100)
    tensor = mt.tensor(values, chunk_size=7)
    fetched = md.Series(tensor).execute().fetch()
    pd.testing.assert_series_equal(fetched, pd.Series(values))

    # Same tensor, but with a chunk_size given to md.Series as well.
    fetched = md.Series(tensor, chunk_size=13).execute().fetch()
    pd.testing.assert_series_equal(fetched, pd.Series(values))

    # from index
    shuffled = np.arange(100)
    np.random.shuffle(shuffled)
    pd_index = pd.Index(shuffled, name='idx_name')
    mars_index = md.Index(pd_index, chunk_size=7)
    fetched = md.Series(mars_index).execute().fetch()
    pd.testing.assert_series_equal(fetched, pd.Series(pd_index))

    # from Mars series
    pd_series = pd.Series(np.random.rand(100), name='series_name')
    doubled = md.Series(pd_series, chunk_size=15) * 2
    fetched = md.Series(doubled, num_partitions=11).execute().fetch()
    pd.testing.assert_series_equal(fetched, pd_series * 2)

    # from raw pandas initializer
    pd_series = pd.Series(np.random.rand(100), name='series_name')
    mars_series = md.Series(pd_series, num_partitions=10)
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(fetched, pd_series)

    # test check instance
    product = mars_series * 2
    assert isinstance(product, md.Series)
Пример #21
0
    def testRename(self):
        """Compare ``rename`` on Mars DataFrame, Series and Index with pandas."""
        def _run(mars_obj):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(mars_obj, concat=True)[0]

        rng = np.random.RandomState(0)
        pdf = pd.DataFrame(rng.rand(10, 4), columns=['A', 'B', 'C', 'D'])
        mdf = md.DataFrame(pdf, chunk_size=3)

        # errors='raise' is expected to emit a warning.
        with self.assertWarns(Warning):
            mdf.rename(str, errors='raise')

        # copy=False is expected to be rejected.
        with self.assertRaises(NotImplementedError):
            mdf.rename({"A": "a", "B": "b"}, axis=1, copy=False)

        # Rename with a callable.
        renamed = mdf.rename(str)
        pd.testing.assert_frame_equal(_run(renamed), pdf.rename(str))

        # Rename columns with a mapping.
        renamed = mdf.rename({"A": "a", "B": "b"}, axis=1)
        pd.testing.assert_frame_equal(
            _run(renamed), pdf.rename({"A": "a", "B": "b"}, axis=1))

        # Same mapping, applied in place.
        mdf.rename({"A": "a", "B": "b"}, axis=1, inplace=True)
        pd.testing.assert_frame_equal(
            _run(mdf), pdf.rename({"A": "a", "B": "b"}, axis=1))

        # Two-level columns: rename only within level 1.
        two_level_cols = pd.MultiIndex.from_tuples(
            (('A', 'C'), ('A', 'D'), ('B', 'E'), ('B', 'F')))
        pdf = pd.DataFrame(rng.rand(10, 4), columns=two_level_cols)
        mdf = md.DataFrame(pdf, chunk_size=3)

        renamed = mdf.rename({"C": "a", "D": "b"}, level=1, axis=1)
        pd.testing.assert_frame_equal(
            _run(renamed),
            pdf.rename({"C": "a", "D": "b"}, level=1, axis=1))

        # Series: a scalar changes the name, a callable maps the index.
        pser = pd.Series(rng.rand(10), name='series')
        mser = md.Series(pser, chunk_size=3)

        renamed = mser.rename('new_series')
        pd.testing.assert_series_equal(_run(renamed),
                                       pser.rename('new_series'))

        renamed = mser.rename(lambda x: 2**x)
        pd.testing.assert_series_equal(_run(renamed),
                                       pser.rename(lambda x: 2**x))

        # Assigning a dict as the name is expected to raise TypeError.
        with self.assertRaises(TypeError):
            mser.name = {1: 10, 2: 20}

        mser.name = 'new_series'
        pd.testing.assert_series_equal(_run(mser), pser.rename('new_series'))

        # MultiIndex: rename both levels at once.
        pmulti = pd.MultiIndex.from_frame(
            pd.DataFrame(rng.rand(10, 2), columns=['A', 'B']))
        mmulti = md.Index(pmulti)

        renamed = mmulti.rename(['C', 'D'])
        pd.testing.assert_index_equal(_run(renamed),
                                      pmulti.rename(['C', 'D']))