Example #1
 def __getitem__(self, arg):
     from cudf.dataframe.series import Series
     from cudf.dataframe.index import Index
     if isinstance(
             arg,
             (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)):
         if len(arg) == 0:
             arg = Series(np.array([], dtype='int32'))
         else:
             arg = Series(arg)
     if isinstance(arg, Series):
         if arg.dtype == np.bool_:
             return self._sr.iloc[arg]
         # To do this efficiently we need a solution to
         # https://github.com/rapidsai/cudf/issues/1087
         out = Series([],
                      dtype=self._sr.dtype,
                      index=self._sr.index.__class__([]))
         for s in arg:
             out = out.append(self._sr.loc[s:s], ignore_index=False)
         return out
     elif is_single_value(arg):
         found_index = self._sr.index.find_label_range(arg, None)[0]
         return self._sr.iloc[found_index]
     elif isinstance(arg, slice):
         start_index, stop_index = self._sr.index.find_label_range(
             arg.start, arg.stop)
         return self._sr.iloc[start_index:stop_index:arg.step]
     else:
         raise NotImplementedError(
             ".loc not implemented for label type {}".format(
                 type(arg).__name__))
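
A minimal usage sketch of the label lookups this method implements, following pandas .loc semantics; assumes a cuDF installation:

import cudf

sr = cudf.Series([10, 20, 30, 40], index=[1, 3, 5, 7])
print(sr.loc[5])       # single label -> scalar value (30)
print(sr.loc[3:5])     # label slice -> both endpoints included
print(sr.loc[[1, 7]])  # list of labels -> gathered subset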
Example #2
    def _loc_to_iloc(self, arg):
        from cudf.dataframe.series import Series
        from cudf.dataframe.index import Index

        if isinstance(
            arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
        ):
            if len(arg) == 0:
                arg = Series(np.array([], dtype="int32"))
            else:
                arg = Series(arg)
        if isinstance(arg, Series):
            if arg.dtype == np.bool_:
                return arg
            else:
                return indices_from_labels(self._sr, arg)
        elif is_scalar(arg):
            found_index = self._sr.index.find_label_range(arg, None)[0]
            return found_index
        elif isinstance(arg, slice):
            start_index, stop_index = self._sr.index.find_label_range(
                arg.start, arg.stop
            )
            return slice(start_index, stop_index, arg.step)
        else:
            raise NotImplementedError(
                ".loc not implemented for label type {}".format(
                    type(arg).__name__
                )
            )
Example #3
    def _compute_validity_mask(self, index, row_tuple, max_length):
        """ Computes the valid set of indices of values in the lookup
        """
        from cudf import DataFrame
        from cudf import Series
        from cudf import concat
        from cudf.utils.cudautils import arange

        lookup = DataFrame()
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            lookup[index._source_data.columns[idx]] = Series(row)
        data_table = concat(
            [
                index._source_data,
                DataFrame({"idx": Series(arange(len(index._source_data)))}),
            ],
            axis=1,
        )
        result = lookup.merge(data_table)["idx"]
        # Avoid computing levels unless the result of the merge is empty,
        # which suggests that a KeyError should be raised.
        if len(result) == 0:
            for idx, row in enumerate(row_tuple):
                if row == slice(None):
                    continue
                if row not in index.levels[idx]:
                    raise KeyError(row)
        return result
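
The merge-as-lookup trick above can be sketched on the host with pandas; the column names here are hypothetical:

import pandas as pd

source = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "y", "x"]})
source["idx"] = range(len(source))             # row numbers to recover

lookup = pd.DataFrame({"a": [1], "b": ["y"]})  # the tuple being looked up
result = lookup.merge(source)["idx"]           # inner join keeps matches only

print(result.tolist())  # [1]; an empty result signals a KeyError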
Example #4
    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
        from cudf.utils.cudautils import arange
        from cudf import Series

        # Slicing rules:
        # - if a tuple, use its first and last elements
        # - if the beginning is open, range from 0 to the highest
        #   valid_index
        # - if the end is open, range from the highest valid_index
        #   to len()
        # - otherwise, range from the lowest beginning index to the
        #   highest ending index
        if isinstance(row_tuple, slice):
            if (
                isinstance(row_tuple.start, numbers.Number)
                or isinstance(row_tuple.stop, numbers.Number)
                or row_tuple == slice(None)
            ):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return arange(start, stop, step)
            start_values = self._compute_validity_mask(
                index, row_tuple.start, max_length
            )
            stop_values = self._compute_validity_mask(
                index, row_tuple.stop, max_length
            )
            return Series(arange(start_values.min(), stop_values.max() + 1))
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)
Example #5
    def _get_column_major(self, df, row_tuple):
        from cudf import Series
        from cudf import DataFrame

        valid_indices = self._get_valid_indices_by_tuple(
            df.columns, row_tuple, len(df._cols)
        )
        result = df._take_columns(valid_indices)

        if isinstance(row_tuple, (numbers.Number, slice)):
            row_tuple = [row_tuple]
        if len(result) == 0 and len(result.columns) == 0:
            result_columns = df.columns.copy(deep=False)
            clear_codes = DataFrame()
            for name in df.columns.names:
                clear_codes[name] = Series([])
            result_columns._codes = clear_codes
            result_columns._source_data = clear_codes
            result.columns = result_columns
        elif len(row_tuple) < len(self.levels) and (
            slice(None) not in row_tuple
            and not isinstance(row_tuple[0], slice)
        ):
            columns = self._popn(len(row_tuple))
            result.columns = columns.take(valid_indices)
        else:
            result.columns = self.take(valid_indices)
        if len(result.columns.levels) == 1:
            columns = []
            for code in result.columns.codes[result.columns.codes.columns[0]]:
                columns.append(result.columns.levels[0][code])
            name = result.columns.names[0]
            result.columns = as_index(columns, name=name)
        return result
Example #6
 def _get_row_major(self, df, row_tuple):
     valid_indices = self._compute_validity_mask(df, row_tuple)
     from cudf import Series
     result = df.take(Series(valid_indices))
     # Build new index - INDEX based MultiIndex
     # ---------------
     from cudf import DataFrame
     out_index = DataFrame()
     # Select the last n-k columns where n is the number of source
     # levels and k is the length of the indexing tuple
     for k in range(len(row_tuple), len(df.index.levels)):
         out_index.add_column(df.index.names[k],
                              df.index.codes[df.index.codes.columns[k]])
     # If there's only one column remaining in the output index, convert
     # it into a StringIndex and name the final index values according
     # to the proper codes.
     if len(out_index.columns) == 1:
         out_index = []
          last = len(result.index.codes.columns) - 1
          for val in result.index.codes[result.index.codes.columns[last]]:
              out_index.append(result.index.levels[last][val])
         # TODO: Warning! The final index column could be arbitrarily
         # ordered integers, not Strings, so we need to check for that
         # dtype and produce a GenericIndex instead of a StringIndex
         out_index = StringIndex(out_index)
          out_index.name = result.index.names[-1]
         result.index = out_index
     else:
         # Otherwise pop the leftmost levels, names, and codes from the
         # source index until it has the correct number of columns (n-k)
          if len(out_index.columns) > 0:
              # Pop the consumed leading levels off the source index
              result.index = result.index._popn(len(row_tuple))
     return result
Example #7
 def _apply_op(self, fn, other=None):
     from cudf.dataframe.series import Series
     idx_series = Series(self)
     op = getattr(idx_series, fn)
     if other is not None:
         return as_index(op(other))
     else:
         return as_index(op())
Example #8
 def normalize_chunks(self, size, chunks):
     if isinstance(chunks, six.integer_types):
         # *chunks* is the chunksize
         return cudautils.arange(0, size, chunks)
     else:
         # *chunks* is an array of chunk leading offset
         chunks = Series(chunks)
         return chunks.to_gpu_array()
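
A host-side sketch (NumPy instead of cudautils) of the two *chunks* forms accepted above:

import numpy as np

size = 10

# An integer chunk size expands into leading offsets...
print(np.arange(0, size, 4))   # [0 4 8]

# ...while an array of leading offsets passes through unchanged.
print(np.asarray([0, 3, 7]))   # [0 3 7]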
Example #9
    def _sortjoin(self, other, how='left', return_indexers=False):
        """Join with another column.

        When the column is an index, set *return_indexers* to obtain
        the indices for shuffling the remaining columns.
        """
        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        lkey, largsort = self.sort_by_values(True)
        rkey, rargsort = other.sort_by_values(True)
        with _gdf.apply_join(
                [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        lkey.to_gpu_array(),
                        rkey.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = lkey.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    indexers = (
                            gather(Series(largsort), lidx),
                            gather(Series(rargsort), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index
Example #10
 def wrapper(*args, **kwargs):
     ret = getattr(self._parent._data, attr)(*args, **kwargs)
     if isinstance(ret, nvstrings.nvstrings):
         ret = Series(
             columnops.as_column(ret),
             index=self._index,
             name=self._parent.name,
         )
     return ret
Example #11
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             dtype = 'float64' if len(self._by) == 1 else 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             mi = MultiIndex(source_data=result[self._by])
             mi.names = self._by
             final_result.index = mi
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series([], name=final_result.columns[0])
             final_series.index = final_result.index
             return final_series
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         idx.name = self._by[0]
         result = result.drop(idx.name)
         if idx.name == self._LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         multi_index = MultiIndex(source_data=result[self._by])
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series(final_result[final_result.columns[0]])
             final_series.name = final_result.columns[0]
             final_series.index = multi_index
             return final_series
         return final_result.set_index(multi_index)
Example #12
 def codes(self):
     from cudf.dataframe.series import Series
     data = self._parent.data
     if self._parent.has_null_mask:
         mask = self._parent.mask
         null_count = self._parent.null_count
         return Series.from_masked_array(data=data.mem, mask=mask.mem,
                                         null_count=null_count)
     else:
         return Series(data)
Example #13
    def _get_row_major(self, df, row_tuple):
        from cudf import Series

        valid_indices = self._get_valid_indices_by_tuple(
            df.index, row_tuple, len(df.index)
        )
        indices = Series(valid_indices)
        result = df.take(indices)
        final = self._index_and_downcast(result, result.index, row_tuple)
        return final
Example #14
    def lower(self):
        """
        Convert strings in the Series/Index to lowercase.

        Returns
        -------
        Series/Index of str dtype
            A copy of the object with all strings converted to lowercase.
        """
        from cudf.dataframe import Series
        return Series(self._parent.data.lower(), index=self._index)
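
A hedged usage sketch for the accessor above, assuming a cuDF installation; nulls pass through unchanged:

import cudf

sr = cudf.Series(["Hello", "WORLD", None])
print(sr.str.lower())  # hello, world, and the null preserved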
Example #15
    def _group_dataframe(self, df, levels):
        """Group dataframe.

        The output dataframe has the same number of rows as the input
        dataframe.  The rows are shuffled so that the groups are moved
        together in ascending order based on the multi-level index.

        Parameters
        ----------
        df : DataFrame
        levels : list[str]
            Column names for the multi-level index.

        Returns
        -------
        (df, segs) : namedtuple
            * df : DataFrame
                The grouped dataframe.
            * segs : Series
                Group starting indices.
        """
        if len(df) == 0:
            # Groupby on empty dataframe
            return _dfsegs_pack(df=df, segs=Buffer(np.asarray([])))
        # Prepare dataframe
        orig_df = df.copy()
        df = df.loc[:, levels].reset_index(drop=True)
        df = df.to_frame() if isinstance(df, Series) else df
        rowid_column = '__cudf.groupby.rowid'
        df[rowid_column] = df.index.as_column()

        col_order = list(levels)

        # Perform grouping
        df, segs, markers = self._group_first_level(col_order[0],
                                                    rowid_column, df)
        rowidcol = df[rowid_column]
        sorted_keys = [Series(df.index.as_column())]
        del df

        more_keys, reordering_indices, segs = self._group_inner_levels(
                                            col_order[1:], rowidcol, segs,
                                            markers=markers)
        sorted_keys.extend(more_keys)
        valcols = [k for k in orig_df.columns if k not in levels]
        # Prepare output
        # All key columns are already sorted
        out_df = DataFrame()
        for k, sr in zip(levels, sorted_keys):
            out_df[k] = sr
        # Shuffle the value columns
        self._group_shuffle(orig_df.loc[:, valcols],
                            reordering_indices, out_df)
        return _dfsegs_pack(df=out_df, segs=segs)
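
A host-side sketch of the (df, segs) contract from the docstring: after a stable sort by the keys, *segs* holds the offset where each group begins.

import numpy as np

keys = np.array([2, 0, 1, 0, 2, 1])
order = np.argsort(keys, kind="stable")  # shuffle so groups sit together
sorted_keys = keys[order]                # [0 0 1 1 2 2]

# A segment begins at index 0 and wherever the key value changes.
segs = np.flatnonzero(np.r_[True, sorted_keys[1:] != sorted_keys[:-1]])
print(segs)                              # [0 2 4]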
Example #16
    def __init__(self, levels=None, codes=None, labels=None, names=None,
                 **kwargs):
        self.names = names
        column_names = []
        if labels:
            warnings.warn("the 'labels' keyword is deprecated, use 'codes' "
                          "instead", FutureWarning)
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if 'source_data' in kwargs:
            self._source_data = kwargs['source_data']
            self._codes = codes
            self._levels = levels
            self.names = self._source_data.columns
            return

        # name setup
        if isinstance(names, (Sequence,
                              pd.core.indexes.frozen.FrozenNDArray,
                              pd.core.indexes.frozen.FrozenList)):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError('Must pass non-zero number of levels/codes')

        import cudf
        if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and\
                not isinstance(codes[0], (Sequence,
                               pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError('Codes is not a Sequence of sequences')
        if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
            self._codes = cudf.dataframe.dataframe.DataFrame()
            for idx, code in enumerate(codes):
                code = np.array(code)
                self._codes.add_column(column_names[idx],
                                       columnops.as_column(code))
        else:
            self._codes = codes

        # Converting levels to a numpy array will produce a Float64Index
        # (on empty levels), mimicking the behavior of pandas.
        self._levels = np.array(
            [Series(level).to_array() for level in levels]
        )
        self._validate_levels_and_codes(self._levels, self._codes)
        self.name = None
        self.names = names
Example #17
    def _hashjoin(self, other, how='left', return_indexers=False):

        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        with _gdf.apply_join(
                [self], [other], how=how, method='hash') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        self.to_gpu_array(),
                        other.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = self.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    indexers = (
                            gather(Series(range(0, len(self))), lidx),
                            gather(Series(range(0, len(other))), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index
Example #18
    def _group_inner_levels(self, columns, rowidcol, segs, markers):
        """Group the second and onwards level.

        Parameters
        ----------
        columns : sequence[str]
            Group keys.  The order is important.
        rowidcol : Series
            Column holding the original rowids; internally used to
            determine the shuffling order.
        segs : Series
            First level group begin offsets.
        markers : device array
            Segment markers from the previous grouping level.

        Returns
        -------
        (sorted_keys, reordering_indices, segments)
            - sorted_keys : list[Series]
                List of sorted key columns.
                Column order is same as arg *columns*.
            - reordering_indices : device array
                The indices to gather on to shuffle the dataframe
                into the grouped sequence.
            - segments : Series
                Group begin offsets.
        """
        dsegs = segs.astype(dtype=np.int32).data.mem
        sorted_keys = []
        plan_cache = {}
        for col in columns:
            # Shuffle the key column according to the previous groups
            srkeys = self._df[col].take(rowidcol.to_gpu_array(),
                                        ignore_index=True)
            # Segmented sort on the key
            shuf = Column(Buffer(cudautils.arange(len(srkeys))))

            cache_key = (len(srkeys), srkeys.dtype, shuf.dtype)
            plan = plan_cache.get(cache_key)
            plan = apply_segsort(srkeys._column, shuf, dsegs, plan=plan)
            plan_cache[cache_key] = plan

            sorted_keys.append(srkeys)  # keep sorted key cols
            # Determine segments
            dsegs, markers = cudautils.find_segments(srkeys.to_gpu_array(),
                                                     dsegs,
                                                     markers=markers)
            # Shuffle
            rowidcol = rowidcol.take(shuf.to_gpu_array(), ignore_index=True)

        reordering_indices = rowidcol.to_gpu_array()
        return sorted_keys, reordering_indices, Series(dsegs)
Example #19
 def _to_frame(self):
     from cudf import DataFrame
     # for each column of codes
     # replace column with mapping from integers to levels
     df = self.codes.copy(deep=False)
     for idx, column in enumerate(df.columns):
         # use merge as a replace fn
          level_ids = cudautils.arange(len(self.levels[idx]),
                                       dtype=df[column].dtype)
          level = DataFrame({'idx': Series(level_ids),
                             'level': self.levels[idx]})
         code = DataFrame({'idx': df[column]})
         df[column] = code.merge(level).level
     return df
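
The "merge as a replace fn" idea can be sketched with pandas, where an inner merge preserves left-key order; joining on the code column maps each integer code to its level value:

import pandas as pd

level = pd.DataFrame({"idx": [0, 1, 2], "level": ["a", "b", "c"]})
code = pd.DataFrame({"idx": [2, 0, 0, 1]})

print(code.merge(level, on="idx")["level"].tolist())  # ['c', 'a', 'a', 'b']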
Example #20
 def apply(self, method):
     gpu_out = numba.cuda.device_array_like(self.gpu_in)
     kernel = get_ewm_kernel(method)
     kernel[(self.number_of_blocks,),
            (self.number_of_threads,),
            0,
            self.shared_buffer_size * 8](self.gpu_in,
                                         gpu_out,
                                         self.window,
                                         self.span,
                                         self.array_len,
                                         self.thread_tile,
                                         self.min_periods)
     return Series(gpu_out)
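
For reference, the bracket syntax above is Numba's launch configuration, kernel[griddim, blockdim, stream, dynamic_shared_mem_bytes](args). A minimal sketch with a trivial kernel (names are illustrative):

import numpy as np
from numba import cuda

@cuda.jit
def copy_kernel(src, dst):
    i = cuda.grid(1)           # global thread index
    if i < src.size:
        dst[i] = src[i]

src = cuda.to_device(np.arange(8, dtype=np.float64))
dst = cuda.device_array_like(src)
copy_kernel[(1,), (8,), 0, 0](src, dst)  # 1 block, 8 threads, default stream
print(dst.copy_to_host())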
Example #21
    def _compute_levels_and_codes(self):
        levels = []
        from cudf import DataFrame
        codes = DataFrame()
        names = []
        # Note: This is an O(N^2) solution using gpu masking
        # to compute new codes for the MultiIndex. There may be
        # a faster solution that could be executed on gpu at the same
        # time the groupby is calculated.
        for by in self._source_data.columns:
            if len(self._source_data[by]) > 0:
                level = self._source_data[by].unique()
                replaced = self._source_data[by].replace(
                        level, Series(range(len(level))))
            else:
                level = np.array([])
                replaced = np.array([])
            levels.append(level)
            codes[by] = Series(replaced, dtype="int32")
            names.append(by)

        self._levels = levels
        self._codes = codes
        self.names = names
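
The codes/levels decomposition computed above can be sketched on the host with pandas.factorize, which does the same job in a single pass:

import pandas as pd

codes, levels = pd.factorize(pd.Series(["b", "a", "b", "c"]))
print(codes)   # [0 1 0 2]
print(levels)  # ['b', 'a', 'c']: levels in order of first appearance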
Example #22
    def cat(self, others=None, sep=None, na_rep=None):
        """
        Concatenate strings in the Series/Index with given separator.

        If *others* is specified, this function concatenates the Series/Index
        and elements of others element-wise. If others is not passed, then all
        values in the Series/Index are concatenated into a single string with
        a given sep.

        Parameters
        ----------
            others : Series or List of str
                Strings to be appended.
                The number of strings must match size() of this instance.
                This must be either a Series of string dtype or a Python
                list of strings.

            sep : str
                If specified, this separator will be appended to each string
                before appending the others.

            na_rep : str
                This character will take the place of any null strings
                (not empty strings) in either list.

                - If `na_rep` is None, and `others` is None, missing
                  values in the Series/Index are omitted from the result.
                - If `na_rep` is None, and `others` is not None, a row
                  containing a missing value in any of the columns
                  (before concatenation) will have a missing value in
                  the result.

        Returns
        -------
        concat : str or Series/Index of str dtype
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of str dtype is returned.
        """
        from cudf.dataframe import Series, Index
        if isinstance(others, (Series, Index)):
            assert others.dtype == np.dtype('object')
            others = others.data
        out = Series(self._parent.data.cat(others=others,
                                           sep=sep,
                                           na_rep=na_rep),
                     index=self._index)
        if len(out) == 1 and others is None:
            out = out[0]
        return out
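
A hedged usage sketch for .str.cat, assuming a cuDF installation:

import cudf

sr = cudf.Series(["a", "b", "c"])
print(sr.str.cat(sep=","))                   # "a,b,c": a single string
print(sr.str.cat(["1", "2", "3"], sep="-"))  # element-wise: a-1, b-2, c-3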
Example #23
    def __getattr__(self, attr, *args, **kwargs):
        from cudf.dataframe.series import Series
        if hasattr(self._parent._data, attr):
            passed_attr = getattr(self._parent._data, attr)
            if callable(passed_attr):

                def wrapper(*args, **kwargs):
                    ret = getattr(self._parent._data, attr)(*args, **kwargs)
                    # Wrap raw nvstrings results back into a Series
                    if isinstance(ret, nvstrings.nvstrings):
                        ret = Series(columnops.as_column(ret),
                                     index=self._index)
                    return ret

                return wrapper
            else:
                return passed_attr
        else:
            raise AttributeError(attr)
Example #24
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series, depending on whether the input DLPack
    tensor is 1D or 2D.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size")
        else:
            raise err
    cols = []
    for idx in range(len(valids)):
        mask = None
        if valids[idx]:
            mask = Buffer(valids[idx])
        cols.append(
            columnops.build_column(Buffer(res[idx]),
                                   dtype=res[idx].dtype,
                                   mask=mask))
    if len(cols) == 1:
        return Series(cols[0])
    else:
        df = DataFrame()
        for idx, col in enumerate(cols):
            df[idx] = col
        return df
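
A hedged usage sketch handing a DLPack capsule from CuPy to cuDF; assumes both libraries are installed and that CuPy's toDlpack() is available:

import cupy as cp
import cudf

arr = cp.arange(5, dtype="int32")
capsule = arr.toDlpack()        # PyCapsule wrapping the device tensor
sr = cudf.from_dlpack(capsule)  # 1D tensor -> Series (deep copy)
print(sr)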
Example #25
File: string.py Project: ziiin/cudf
    def extract(self, pat, flags=0, expand=True):
        """
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the first
        match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        expand : bool, default True
            If True, return a DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group or
            DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series/Index
            A DataFrame with one row for each subject string, and one column
            for each group. If `expand=False` and `pat` has only one capture
            group, then return a Series/Index.

        Notes
        -----
        The `flags` parameter is not yet supported and will raise a
        NotImplementedError if anything other than the default value is passed.
        """
        if flags != 0:
            raise NotImplementedError("`flags` parameter is not yet supported")

        from cudf.dataframe import DataFrame, Series
        out = self._parent.data.extract(pat)
        if len(out) == 1 and expand is False:
            return Series(
                out[0],
                index=self._index
            )
        else:
            out_df = DataFrame(index=self._index)
            for idx, val in enumerate(out):
                out_df[idx] = val
            return out_df
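
A hedged usage sketch for .str.extract, assuming a cuDF installation:

import cudf

sr = cudf.Series(["a1", "b2", "c3"])
print(sr.str.extract(r"([ab])(\d)"))          # two groups -> DataFrame
print(sr.str.extract(r"(\d)", expand=False))  # one group -> Series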
Example #26
    def searchsorted(self, value, side="left"):
        """Find indices where elements should be inserted to maintain order

        Parameters
        ----------
        value : Column
            Column of values to search for
        side : str {'left', 'right'}, optional
            If 'left', the index of the first suitable location found
            is given.  If 'right', return the last such index.

        Returns
        -------
        An index series of insertion points with the same shape as value
        """
        from cudf.dataframe.series import Series

        idx_series = Series(self, name=self.name)
        result = idx_series.searchsorted(value, side)
        return as_index(result)
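
A hedged usage sketch of searchsorted on a sorted index, following the pandas API shape; assumes a recent cuDF where cudf.Index is available:

import cudf

idx = cudf.Index([1, 3, 5, 7])
print(idx.searchsorted(4))                # 2: insertion point keeping order
print(idx.searchsorted(5, side="right"))  # 3: insert after the existing 5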
Example #27
    def len(self):
        """
        Computes the length of each element in the Series/Index.

        Returns
        -------
        Series or Index of int
            A Series or Index of integer values indicating the length
            of each element in the Series or Index.
        """
        from cudf.dataframe.series import Series
        out_dev_arr = rmm.device_array(len(self._parent), dtype='int32')
        ptr = get_ctype_ptr(out_dev_arr)
        self._parent.data.len(ptr)

        mask = None
        if self._parent.null_count > 0:
            mask = self._parent.mask

        column = columnops.build_column(Buffer(out_dev_arr),
                                        np.dtype('int32'),
                                        mask=mask)
        return Series(column, index=self._index)
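
A hedged usage sketch for .str.len, assuming a cuDF installation; the null entry stays null:

import cudf

sr = cudf.Series(["a", "abc", None])
print(sr.str.len())  # 1, 3, and a null for the missing entry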
Example #28
    def _group_first_level(self, col, rowid_column, df):
        """Group first level *col* of *df*

        Parameters
        ----------
        col : str
            Name of the first group key column.
        rowid_column : str
            Name of the special column holding the original rowids.
        df : DataFrame
            The dataframe being grouped.

        Returns
        -------
        (df, segs, markers)
            - df : DataFrame
                Sorted by *col*, which becomes the index.
            - segs : Series
                Group begin offsets.
            - markers : device array
                Segment markers for the sorted keys.
        """
        df = df.loc[:, [col, rowid_column]]
        df = df.set_index(col).sort_index()
        segs, markers = df.index._find_segments()
        return df, Series(segs), markers
Example #29
def read_csv_strings(filepath_or_buffer, lineterminator='\n',
                     quotechar='"', quoting=True, doublequote=True,
                     sep=',', delimiter=None, delim_whitespace=False,
                     skipinitialspace=False, names=None, dtype=None,
                     skipfooter=0, skiprows=0, dayfirst=False,
                     compression='infer', thousands=None, decimal='.',
                     true_values=None, false_values=None, nrows=None):

    """
    **Experimental**: This function exists only as a beta way to use
    `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_ with cudf.

    Future versions of cuDF will provide cleaner integration.

    Uses mostly the same arguments as read_csv.
    Note: Doesn't currently support auto-column detection, header, usecols
    and mangle_dupe_cols args.

    Returns
    -------
    columns : ordered list of cudf.dataframe.Series and nvstrings objects
      numeric or date dtyped columns will be Series.

      'str' dtyped columns will be
      `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_.

    Examples
    --------

    .. code-block:: python

      import cudf

      # Create a test csv file
      filename = 'foo.csv'
      lines = [
        "num1,datetime,text",
        "123,2018-11-13T12:00:00,abc",
        "456,2018-11-14T12:35:01,def",
        "789,2018-11-15T18:02:59,ghi"
      ]
      with open(filename, 'w') as fp:
          fp.write('\\n'.join(lines)+'\\n')

      # Read the file with cudf
      names = ['num1', 'datetime', 'text']
      dtypes = ['int', 'date', 'str']
      columns = cudf.io.csv.read_csv_strings(filename, delimiter=',',
                              names=names, dtype=dtypes,
                              skiprows=1)
      # Display results
      columns[0]
      print(columns[0])
      columns[2]
      print(columns[2])

    Output:

    .. code-block:: python

      <cudf.Series nrows=3 >
      0  123
      1  456
      2  789

      <nvstrings count=3>
      ['abc', 'def', 'ghi']

    See Also
    --------
    .read_csv
    """
    import nvstrings
    from cudf.dataframe.series import Series

    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string('True'), _wrap_string('TRUE')]
    arr_false_values = [_wrap_string('False'), _wrap_string('FALSE')]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    compression_bytes = _wrap_string(compression)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns

    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if newcol.dtype == np.dtype('datetime64[ms]'):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols
Example #30
    def cat(self, others=None, sep=None, na_rep=None):
        """
        Concatenate strings in the Series/Index with given separator.

        If *others* is specified, this function concatenates the Series/Index
        and elements of others element-wise. If others is not passed, then all
        values in the Series/Index are concatenated into a single string with
        a given sep.

        Parameters
        ----------
            others : Series or List of str
                Strings to be appended.
                The number of strings must match size() of this instance.
                This must be either a Series of string dtype or a Python
                list of strings.

            sep : str
                If specified, this separator will be appended to each string
                before appending the others.

            na_rep : str
                This character will take the place of any null strings
                (not empty strings) in either list.

                - If `na_rep` is None, and `others` is None, missing
                  values in the Series/Index are omitted from the result.
                - If `na_rep` is None, and `others` is not None, a row
                  containing a missing value in any of the columns
                  (before concatenation) will have a missing value in
                  the result.

        Returns
        -------
        concat : str or Series/Index of str dtype
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of str dtype is returned.
        """
        from cudf.dataframe import Series, Index

        if isinstance(others, (Series, Index)):
            # others is already a Series/Index, so concatenate directly.
            assert others.dtype == np.dtype('object')
            others = others.data
        elif utils.is_list_like(others) and others:
            # others is a list-like object (a list or tuple). Only the
            # first element is inspected to decide how to treat the
            # whole list: iterating over the entire input up front could
            # be expensive for very large lists, so this is a sanity
            # check rather than full validation.
            first = others[0]

            if utils.is_list_like(first) or \
                    isinstance(first, (Series, Index, pd.Series, pd.Index)):
                # Every element of others should itself be list-like,
                # not a plain string/bytes, so fold them together
                # pairwise into a single nvstrings instance.
                first = None
                for frame in others:
                    if not isinstance(frame, (Series, Index)):
                        # Coerce each input to a string Series so every
                        # operand passed to .cat is backed by nvstrings.
                        frame = Series(frame, dtype='str')

                    if first is None:
                        # frame is a Series/Index; extract its
                        # nvstrings pointer to seed the accumulator.
                        first = frame.data
                    else:
                        assert frame.dtype == np.dtype('object')
                        frame = frame.data
                        first = first.cat(frame, sep=sep, na_rep=na_rep)

                others = first
            elif not utils.is_list_like(first):
                # The first element looks scalar-like, so treat others
                # as a flat list of strings.
                others = Series(others)
                others = others.data
        elif isinstance(others, (pd.Series, pd.Index)):
            others = Series(others)
            others = others.data

        out = Series(self._parent.data.cat(others=others,
                                           sep=sep,
                                           na_rep=na_rep),
                     index=self._index)
        if len(out) == 1 and others is None:
            out = out[0]
        return out