Example #1
def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
Example #2
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    test1 = Series(ary)
    assert (test1.has_null_mask)
    assert (test1.null_count == 20)

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert (test2['a'].has_null_mask)
    assert (test2['a'].null_count == 20)

    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert (test3.has_null_mask)
    assert (test3.null_count == 20)

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert (test4['lst'].has_null_mask)
    assert (test4['lst'].null_count == 2)
Example #3
def test_dataframe_column_name_indexing():
    df = DataFrame()
    data = np.asarray(range(10), dtype=np.int32)
    df['a'] = data
    df[1] = data
    np.testing.assert_equal(df['a'].to_array(),
                            np.asarray(range(10), dtype=np.int32))
    np.testing.assert_equal(df[1].to_array(),
                            np.asarray(range(10), dtype=np.int32))

    pdf = pd.DataFrame()
    nelem = 10
    pdf['key1'] = np.random.randint(0, 5, nelem)
    pdf['key2'] = np.random.randint(0, 3, nelem)
    pdf[1] = np.arange(1, 1 + nelem)
    pdf[2] = np.random.random(nelem)
    df = DataFrame.from_pandas(pdf)
    for i in range(1, len(pdf.columns)+1):
        for idx in combinations(pdf.columns, i):
            assert(pdf[list(idx)].equals(df[list(idx)].to_pandas()))

    # test for only numeric columns
    df = pd.DataFrame()
    for i in range(0, 10):
        df[i] = range(nelem)
    gdf = DataFrame.from_pandas(df)
    assert_eq(gdf, df)
Example #4
def agg(self, agg_types):
    df = DataFrame()
    by = []
    if self.level is not None:
        if isinstance(self.source_series.index, MultiIndex):
            # Add index columns specified by multiindex into _df
            # Record the index column names for the groupby
            for col in self.source_series.index.codes:
                df[self.group_name + col] = \
                    self.source_series.index.codes[col]
                by.append(self.group_name + col)
    else:
        if isinstance(self.group_keys, Series):
            df[self.group_name] = self.group_keys
            by = self.group_name
        else:
            df = self.group_keys
            by = self._by
    df[self.source_name] = self.source_series
    groupby = df.groupby(by).agg(agg_types)
    idx = groupby.index
    if len(groupby.columns) == 1:
        result = groupby[self.source_name]
        result.name = self.source_series.name
        idx.name = None
        result = result.set_index(idx)
    else:
        idx.name = self.group_name
        result = groupby.set_index(idx)
    if len(result) == 0 and self._by is not None:
        empties = [[] for x in range(len(self._by))]
        mi = MultiIndex(empties, empties, names=self._by)
        result = result.set_index(mi)
    return result
Example #5
    def __getattr__(self, attr):
        df = DataFrame()
        df[self.source_name] = self.source_series
        by = []
        if self.level is not None:
            if isinstance(self.source_series.index, MultiIndex):
                # Add index columns specified by multiindex into _df
                # Record the index column names for the groupby
                for col in self.source_series.index.codes:
                    df[self.group_name +
                       col] = self.source_series.index.codes[col]
                    by.append(self.group_name + col)
        else:
            df[self.group_name] = self.group_series
            by = self.group_name
        groupby = df.groupby(by, level=self.level, sort=self.sort)
        result_df = getattr(groupby, attr)()

        def get_result():
            result_series = result_df[self.source_name]
            result_series.name = self.source_name if self.source_name !=\
                '_x' else None
            idx = result_df.index
            idx.name = self.group_name if self.group_name != '_y' else None
            # set_index returns a new Series rather than mutating in place
            result_series = result_series.set_index(idx)
            return result_series

        return get_result
Example #6
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']

    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
Example #7
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take rows of the key columns and build a set of the key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other groups
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
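
The contract exercised by this test, as a minimal hedged sketch (assuming a cuDF-style DataFrame importable as cudf; data values are illustrative):

import cudf

gdf = cudf.DataFrame({'key': [0, 1, 0, 2], 'val': [10, 11, 12, 13]})
parts = gdf.partition_by_hash(['key'], nparts=2)
# A list with exactly nparts DataFrames comes back...
assert isinstance(parts, list) and len(parts) == 2
# ...rows sharing a key hash to the same partition, and no rows are lost.
assert sum(len(p) for p in parts) == len(gdf)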
Example #8
def test_dataframe_empty_to_string():
    # Test for printing empty dataframe
    df = DataFrame()
    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: []\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Example #9
def test_dataframe_iloc_setitem():
    gdf = DataFrame()
    nelem = 123
    gdf['a'] = np.random.randint(low=0, high=100, size=nelem) \
        .astype(np.int32)
    gdf['b'] = np.random.random(nelem).astype(np.float32)

    gdf.iloc[0] = nelem
Example #10
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    axis : {0, 1}, default 0
        The axis to concatenate along: 0 along the index (rows),
        1 along the columns.
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead.
    sort : bool, default None
        Not yet supported; must be ``None`` or ``False``.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    """
    if sort not in (None, False):
        raise NotImplementedError("sort parameter is not yet supported")

    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    allowed_typs = {Series, DataFrame}
    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()
        for idx, o in enumerate(objs):
            if isinstance(o, Series):
                name = o.name
                if o.name is None:
                    # pandas uses 0-offset
                    name = idx - 1
                df[name] = o
            else:
                for col in o.columns:
                    df[col] = o[col]
        return df

    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." % [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs, axis=axis)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
Example #11
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
Example #12
def test_dataframe_emptycolumns_to_string():
    # Test for printing dataframe having empty columns
    df = DataFrame()
    df['a'] = []
    df['b'] = []
    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: ['a', 'b']\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Example #13
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
Example #14
def agg(self, agg_types):
    df = DataFrame()
    df['x'] = self.source_series
    if self.level is not None:
        df['y'] = self.source_series.index
    else:
        df['y'] = self.group_series
    groupby = df.groupby('y').agg(agg_types)
    idx = groupby.index
    idx.name = None
    # set_index returns a new frame rather than mutating in place
    groupby = groupby.set_index(idx)
    return groupby
Example #15
    def _apply_basic_agg(self, agg_type, sort_results=False):
        """
        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        """
        result = DataFrame()
        add_col_values = True

        ctx = ffi.new('gdf_context*')
        ctx.flag_sorted = 0
        ctx.flag_method = self._method
        ctx.flag_distinct = 0

        val_columns = self._val_columns
        val_columns_out = self._val_columns

        result = self._apply_agg(agg_type,
                                 result,
                                 add_col_values,
                                 ctx,
                                 val_columns,
                                 val_columns_out,
                                 sort_result=sort_results)

        # If a Groupby has one index column and one value column
        # and as_index is set, return a Series instead of a df
        if isinstance(val_columns, (str, Number)) and self._as_index:
            result_series = result[val_columns]
            idx = index.as_index(result[self._by[0]])
            if self.level == 0:
                idx.name = self._original_index_name
            else:
                idx.name = self._by[0]
            result_series = result_series.set_index(idx)
            return result_series

        # TODO: Do MultiIndex here
        if (self._as_index):
            idx = index.as_index(result[self._by[0]])
            idx.name = self._by[0]
            result.drop_column(idx.name)
            if self.level == 0:
                idx.name = self._original_index_name
            else:
                idx.name = self._by[0]
            result = result.set_index(idx)

        nvtx_range_pop()

        return result
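
_apply_basic_agg is internal, so it is not called directly; a minimal sketch of the public call that would exercise it, assuming a cuDF-style groupby API as in the tests above (import name and data are illustrative):

import cudf

gdf = cudf.DataFrame({'key': [0, 0, 1], 'val': [1, 2, 3]})
# groupby(...).max() dispatches to a basic aggregation; per the branch above,
# one key column plus one value column with as_index set yields a Series.
out = gdf.groupby('key').max()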
Example #16
def test_nonmatching_index_setitem(nrows):
    np.random.seed(0)

    gdf = DataFrame()
    gdf['a'] = np.random.randint(2147483647, size=nrows)
    gdf['b'] = np.random.randint(2147483647, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(2147483647, size=nrows)
    gdf['c'] = test_values
    assert (len(test_values) == len(gdf['c']))
    assert (gdf['c'].to_pandas().equals(
        Series(test_values).set_index(gdf._index).to_pandas()))
Example #17
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    gdf = DataFrame()
    gdf['a'] = list(range(nelem))
    gdf['b'] = list(range(nelem, 2 * nelem))
    gdf['a'] = gdf['a'].set_mask(utils.random_bitmask(nelem))
    gdf['b'] = gdf['b'].set_mask(utils.random_bitmask(nelem))

    def do_slice(x):
        return x[slice_start:slice_end]

    expect = do_slice(gdf.to_pandas())
    got = do_slice(gdf).to_pandas()

    pd.testing.assert_frame_equal(expect, got)
Example #18
def __getitem__(self, arg):
    if isinstance(arg, (str, Number)):
        if arg not in self._val_columns:
            raise KeyError("Column not found: " + str(arg))
    else:
        for val in arg:
            if val not in self._val_columns:
                raise KeyError("Column not found: " + str(val))
    result = self.copy(deep=False)
    result._df = DataFrame()
    if isinstance(self._by, (str, Number)):
        result._df[self._by] = self._df[self._by]
    else:
        for by in self._by:
            result._df[by] = self._df[by]
    result._val_columns = arg
    if isinstance(arg, (str, Number)):
        result._df[arg] = self._df[arg]
    else:
        for a in arg:
            result._df[a] = self._df[a]
    if isinstance(result._val_columns, (str, Number)):
        new_by = [result._by] if isinstance(result._by, (str, Number)) \
            else list(result._by)
        new_val_columns = [result._val_columns] if \
            isinstance(result._val_columns, (str, Number)) \
            else list(result._val_columns)
        new_val_series = result._df[new_val_columns[0]]
        new_val_series.name = new_val_columns[0]
        new_by_keys = result._df[new_by]
        new_by_keys.name = new_by[0]
        return SeriesGroupBy(new_val_series, new_by_keys, by=result._by)
    return result
Example #19
def read_feather(path, *args, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read feather dataset, this may "
                  "be GPU accelerated in the future")
    pa_table = feather.read_table(path, *args, **kwargs)
    return DataFrame.from_arrow(pa_table)
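
A short usage sketch (the file path is a placeholder for this sketch; the read happens on the CPU via PyArrow and the result is converted to a GPU DataFrame):

import cudf

# "data.feather" is an illustrative path, not a real dataset.
df = cudf.read_feather("data.feather")
print(df.head())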
Example #20
File: orc.py Project: zeichuan/cudf
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None)
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = cpp_read_orc(filepath_or_buffer, columns, stripe, skip_rows,
                          num_rows, use_index)
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = DataFrame.from_arrow(pa_table)

    return df
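
A usage sketch of the engine switch (path and column names are placeholders): engine="cudf" parses the file on the GPU, while any other value falls back to PyArrow on the CPU with a warning.

import cudf

# Illustrative path/columns; engine="cudf" takes the native GPU reader path.
gpu_df = cudf.read_orc("data.orc", engine="cudf", columns=["a", "b"])
# Any other engine value reads via PyArrow on the CPU and warns.
cpu_df = cudf.read_orc("data.orc", engine="pyarrow")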
Example #21
def concat(objs, ignore_index=False):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    """
    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." % [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
Example #22
def test_dataframe_iloc(nelem):
    gdf = DataFrame()

    gdf['a'] = ha = np.random.randint(low=0, high=100, size=nelem) \
        .astype(np.int32)
    gdf['b'] = hb = np.random.random(nelem).astype(np.float32)

    pdf = pd.DataFrame()
    pdf['a'] = ha
    pdf['b'] = hb

    # Positive tests for slicing using iloc
    def assert_col(g, p):
        np.testing.assert_equal(g['a'].to_array(), p['a'])
        np.testing.assert_equal(g['b'].to_array(), p['b'])

    assert_col(gdf.iloc[-1:1], pdf.iloc[-1:1])
    assert_col(gdf.iloc[nelem-1:-1], pdf.iloc[nelem-1:-1])
    assert_col(gdf.iloc[0:nelem-1], pdf.iloc[0:nelem-1])
    assert_col(gdf.iloc[0:nelem], pdf.iloc[0:nelem])
    assert_col(gdf.iloc[1:1], pdf.iloc[1:1])
    assert_col(gdf.iloc[1:2], pdf.iloc[1:2])
    assert_col(gdf.iloc[nelem-1:nelem+1], pdf.iloc[nelem-1:nelem+1])
    assert_col(gdf.iloc[nelem:nelem*2], pdf.iloc[nelem:nelem*2])

    # Positive tests for int indexing
    def assert_series(g, p):
        np.testing.assert_equal(g.to_array(), p)

    assert_series(gdf.iloc[-1 * nelem], pdf.iloc[-1 * nelem])
    assert_series(gdf.iloc[-1], pdf.iloc[-1])
    assert_series(gdf.iloc[0], pdf.iloc[0])
    assert_series(gdf.iloc[1], pdf.iloc[1])
    assert_series(gdf.iloc[nelem - 1], pdf.iloc[nelem - 1])
Example #23
    def _apply_basic_agg(self, agg_type):
        """
        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        """
        result = DataFrame()
        add_col_values = True

        ctx = ffi.new('gdf_context*')
        ctx.flag_sorted = 0
        ctx.flag_method = self._method
        ctx.flag_distinct = 0

        val_columns = self._val_columns
        val_columns_out = [agg_type + "_" + column for column in val_columns]

        result = self._apply_agg(agg_type,
                                 result,
                                 add_col_values,
                                 ctx,
                                 val_columns,
                                 val_columns_out,
                                 sort_result=False)
        nvtx_range_pop()
        return result
Example #24
def test_dataframe_to_string_wide():
    # Test basic
    df = DataFrame()
    for i in range(100):
        df['a{}'.format(i)] = list(range(3))
    got = df.to_string(ncols=8)
    print(got)
    expect = '''
    a0   a1   a2   a3   a4   a5   a6 ...  a99
0    0    0    0    0    0    0    0 ...    0
1    1    1    1    1    1    1    1 ...    1
2    2    2    2    2    2    2    2 ...    2
[92 more columns]
'''
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Example #25
def test_dataframe_hash_partition_masked_value(nrows):
    gdf = DataFrame()
    gdf['key'] = np.arange(nrows)
    gdf['val'] = np.arange(nrows) + 100
    bitmask = utils.random_bitmask(nrows)
    bytemask = utils.expand_bits_to_bytes(bitmask)
    gdf['val'] = gdf['val'].set_mask(bitmask)
    parted = gdf.partition_by_hash(['key'], nparts=3)
    # Verify that the valid mask is correct
    for p in parted:
        df = p.to_pandas()
        for row in df.itertuples():
            valid = bool(bytemask[row.key])
            expected_value = row.key + 100 if valid else -1
            got_value = row.val
            assert expected_value == got_value
Example #26
def test_dataframe_copy_shallow():
    # Test for copy dataframe using class method
    df = DataFrame()
    df['a'] = [1, 2, 3]
    df2 = df.copy()
    df2['b'] = [4, 2, 3]
    got = df.to_string()
    print(got)
    expect = '''
     a
0    1
1    2
2    3
'''
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Example #27
def read_parquet(path, *args, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read Parquet dataset, this will "
                  "be GPU accelerated in the future")
    pa_table = pq.read_pandas(path, *args, **kwargs)
    return DataFrame.from_arrow(pa_table)
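
A usage sketch (placeholder path): positional and keyword arguments are forwarded to pyarrow.parquet.read_pandas, so PyArrow options such as columns pass through unchanged.

import cudf

# Illustrative path; `columns` is forwarded to pyarrow.parquet.read_pandas.
df = cudf.read_parquet("data.parquet", columns=["a"])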
Example #28
def read_orc(path, columns=None, **kwargs):
    """{docstring}"""
    warnings.warn("Using CPU via PyArrow to read ORC dataset, this will "
                  "be GPU accelerated in the future")
    orc_file = orc.ORCFile(path)
    pa_table = orc_file.read(columns=columns)
    return DataFrame.from_arrow(pa_table)
Example #29
def test_dataframe_to_string():
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        string = str(df)
        print(string)
        assert string.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        string = df.to_string(ncols=3)
        print(string)
        assert string.splitlines()[-2] == '[1 more rows]'
        assert string.splitlines()[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])

        data = np.arange(6)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid positions are correct
        for i in validids:
            assert data[i] == values[i]
        # null position is correct
        for i in range(len(values)):
            if i not in validids:
                assert values[i] is None

        got = df.to_string(nrows=None)
        print(got)
        expect = '''
  a b  c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
Example #30
def test_kernel_shallow_copy():
    pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                       columns=['a', 'b', 'c'])
    gdf = DataFrame.from_pandas(pdf)
    cdf = gdf.copy(deep=False)
    sr = gdf['a']
    add_one[1, len(sr)](sr.to_gpu_array())
    assert_eq(gdf, cdf)