Пример #1
0
 def _apply_columns_list(self, collist, colsdesc):
     """
     Translate a list-style `columns` specification into read types.

     Each entry of `collist` describes what to do with the column at the
     same position in `colsdesc`:

       * `None` / `False` -- skip the column;
       * `True` / `...` -- keep the column under its detected name;
       * a string -- keep the column, renaming it to that string;
       * an `stype`, `ltype` or python type -- keep the detected name and
         request the given type for the column;
       * a tuple `(name, type)` -- rename the column and request the given
         type for it.

     The names of all retained columns are stored in `self._colnames`.

     :param collist: list of per-column entries, as described above.
     :param colsdesc: descriptors of the detected columns; only the `.name`
         attribute of each descriptor is used here.
     :returns: list of `len(colsdesc)` read-type values; skipped columns
         carry `rtype.rdrop.value`.
     :raises TValueError: if `collist` and `colsdesc` have different
         lengths, or if a requested type is not a key of `_rtypes_map`.
     :raises TTypeError: if an entry has an unsupported type.
     """
     n = len(colsdesc)
     nn = len(collist)
     if n != nn:
         raise TValueError("Input contains %s, whereas `columns` "
                           "parameter specifies only %s"
                           % (plural(n, "column"), plural(nn, "column")))
     colnames = []
     # Every column starts as "dropped"; retained columns are updated below
     coltypes = [rtype.rdrop.value] * n
     for i in range(n):
         entry = collist[i]
         if entry is None or entry is False:
             pass
         elif entry is True or entry is Ellipsis:
             colnames.append(colsdesc[i].name)
             coltypes[i] = rtype.rauto.value
         elif isinstance(entry, str):
             colnames.append(entry)
             coltypes[i] = rtype.rauto.value
         elif isinstance(entry, (stype, ltype, type)):
             # Validate before the lookup, so that an unsupported type is
             # reported the same way as in the `(name, type)` case below
             # instead of escaping as a bare KeyError.
             if entry not in _rtypes_map:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r"
                                   % (entry, colsdesc[i].name))
             colnames.append(colsdesc[i].name)
             coltypes[i] = _rtypes_map[entry].value
         elif isinstance(entry, tuple):
             newname, newtype = entry
             if newtype not in _rtypes_map:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r" % (newtype, newname))
             colnames.append(newname)
             coltypes[i] = _rtypes_map[newtype].value
         else:
             raise TTypeError("Entry `columns[%d]` has invalid type %r"
                              % (i, entry.__class__.__name__))
     self._colnames = colnames
     return coltypes
Пример #2
0
    def colindex(self, name):
        """
        Resolve a column reference into a non-negative column index.

        :param name: either the name of a column, or its numeric index. A
            numeric index may be negative, in which case it is counted from
            the end and converted into the equivalent non-negative index.
        :returns: the index of the requested column.
        :raises TValueError: if there is no column with such name, or if the
            numeric index is out of bounds.
        """
        if isinstance(name, str):
            if name not in self._inames:
                raise TValueError("Column `%s` does not exist in %r" %
                                  (name, self))
            return self._inames[name]

        ncols = self._ncols
        if -ncols <= name < ncols:
            # Negative indices are shifted into the [0; ncols) range
            return name + ncols if name < 0 else name
        raise TValueError("Column index `%d` is invalid for a "
                          "datatable with %s" %
                          (name, plural(ncols, "column")))
Пример #3
0
def process_column(col, df):
    """
    Validate a single column selector `col` against the frame `df`.

    Depending on the kind of the selector, the result is:
      * for an integer -- the equivalent non-negative column index;
      * for a string -- whatever `df.colindex()` returns for that name;
      * for a slice over column names -- a triple (start, count, step);
      * for an integer slice -- the result of `normalize_slice()`;
      * for a `ColSelectorExpr` -- its `col_index`, after resolving it;
      * for any other `BaseExpr` -- the expression itself.

    A `TValueError` is raised for an out-of-range index or a malformed
    slice, and a `TTypeError` for a selector of an unsupported type.
    """
    if isinstance(col, int):
        total = df.ncols
        if not (-total <= col < total):
            raise TValueError(
                "Column index `{col}` is invalid for a frame with {ncolumns}".
                format(col=col, ncolumns=plural(total, "column")))
        # The modulo maps negative indices onto their positive counterparts
        return col % total

    if isinstance(col, str):
        # `colindex` raises an exception when there is no such column
        return df.colindex(col)

    if isinstance(col, slice):
        start, stop, step = col.start, col.stop, col.step
        if isinstance(start, str) or isinstance(stop, str):
            # Slice over column names. Each endpoint has to be either a
            # name or omitted; anything else is left as None and rejected.
            if isinstance(start, str):
                first = df.colindex(start)
            else:
                first = 0 if start is None else None
            if isinstance(stop, str):
                last = df.colindex(stop)
            else:
                last = df.ncols - 1 if stop is None else None
            if first is None or last is None:
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if step is not None:
                raise TValueError("Column name slices cannot use strides: %r" %
                                  col)
            # Both endpoints are inclusive; the slice may run backwards
            direction = 1 if last >= first else -1
            return (first, abs(last - first) + 1, direction)

        integer_valued = all(part is None or isinstance(part, int)
                             for part in (start, stop, step))
        if not integer_valued:
            raise TValueError("%r is not integer-valued" % col)
        return normalize_slice(col, df.ncols)

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    raise TTypeError("Unknown column selector: %r" % col)
Пример #4
0
    def sort_columns(self, frame):
        """
        Sort `frame` by a random, non-empty selection of its columns.

        The action is reported on stdout and, when `python_output` is set,
        also recorded there as a line of python code. A frame without
        columns is left untouched.
        """
        total = frame.ncols
        if total == 0:
            return

        # Exponentially distributed count of columns: at least one, and
        # never more than the frame has
        nsort = min(1 + int(random.expovariate(1.0)), total)
        picked = random.sample(range(total), nsort)

        message = "[10] Sorting %s in ascending order: %r"
        print(message % (plural(len(picked), "column"), picked))
        if python_output:
            python_output.write("DT = DT.sort(%r)\n" % picked)
        frame.sort_columns(picked)
Пример #5
0
    def set_key_columns(self, frame):
        """
        Try to set a random, non-empty selection of columns as the key.

        The action is reported on stdout. When `python_output` is set, the
        equivalent python code is recorded there: a plain assignment if
        `frame.set_key_columns()` returned a truthy value, or an assignment
        wrapped into a `pytest.raises` block otherwise. A frame without
        columns is left untouched.
        """
        total = frame.ncols
        if total == 0:
            return

        nkeys = min(1 + int(random.expovariate(1.0)), total)
        keys = random.sample(range(total), nkeys)
        all_names = frame.names
        names = [all_names[k] for k in keys]

        print("[13] Setting %s: %r" % (plural(nkeys, "key column"), keys))

        succeeded = frame.set_key_columns(keys, names)
        if not python_output:
            return
        if succeeded:
            python_output.write("DT.key = %r\n" % names)
        else:
            python_output.write("with pytest.raises(ValueError, "
                                "match='Cannot set a key: the values are "
                                "not unique'):\n"
                                "    DT.key = %r\n\n" % names)
Пример #6
0
    def rename(self, columns: Union[Dict[str, str], Dict[int, str], List[str],
                                    Tuple[str, ...]]):
        """
        Assign new names to the columns of the datatable.

        :param columns: either a list/tuple holding the complete set of new
            names (one per column), or a dictionary of {old: new} entries,
            where every `old` key is resolved through `colindex()`.
        :returns: None
        :raises TValueError: if a list/tuple of a wrong length is given.
        """
        if not isinstance(columns, (list, tuple)):
            # Partial rename: start from the current names and patch them
            names = list(self._names)
            for oldname, newname in columns.items():
                names[self.colindex(oldname)] = newname
        else:
            if len(columns) != self._ncols:
                raise TValueError("Cannot rename columns to %r: expected %s" %
                                  (columns, plural(self._ncols, "name")))
            names = columns
        self._fill_from_dt(self._dt, names=names)
Пример #7
0
def _rbind(self, *frames, force=False, bynames=True):
    """
    Append rows of `frames` to the current Frame.

    This is equivalent to `list.extend()` in Python: the Frames are combined
    by rows, i.e. rbinding a Frame of shape [n x k] to a Frame of shape
    [m x k] produces a Frame of shape [(m + n) x k].

    This method modifies the current Frame in-place. If you do not want
    the current Frame modified, then append all Frames to an empty Frame:
    `dt.Frame().rbind(frame1, frame2)`.

    If Frame(s) being appended have columns of types different from the
    current Frame, then these columns will be promoted to the largest of two
    types: bool -> int -> float -> string.

    If you need to append multiple Frames, then it is more efficient to
    collect them into an array first and then do a single `rbind()`, than it is
    to append them one-by-one.

    Appending data to a Frame opened from disk will force loading the
    current Frame into memory, which may fail with an OutOfMemory exception.

    Parameters
    ----------
    frames: sequence or list of Frames
        One or more Frame to append. These Frames should have the same
        columnar structure as the current Frame (unless option `force` is
        used).

    force: boolean, default False
        If True, then the Frames are allowed to have mismatching set of
        columns. Any gaps in the data will be filled with NAs.

    bynames: boolean, default True
        If True, the columns in Frames are matched by their names. For
        example, if one Frame has columns ["colA", "colB", "colC"] and the
        other ["colB", "colA", "colC"] then we will swap the order of the first
        two columns of the appended Frame before performing the append.
        However if `bynames` is False, then the column names will be ignored,
        and the columns will be matched according to their order, i.e. i-th
        column in the current Frame to the i-th column in each appended
        Frame.
    """
    n = self.ncols

    # `spec` will be the description of how the DataTables are to be merged:
    # it is a list of tuples (core.DataTable, Optional[List[int]]), where the
    # first item in the tuple is a Frame being appended, and the second item
    # is the array of column indices within that Frame. For example, if the
    # array is [1, 0, None, 2, None] then it means that we need to take the
    # Frame being appended, reorder its columns as (2nd column, 1st column,
    # column of NAs, 3rd column, column of NAs) and only then "stitch" to the
    # resulting Frame of 5 columns.
    spec = []
    final_names = list(self.names)

    # Append by column names, filling with NAs as necessary
    if bynames:
        # `inames` is a mapping of column_name => column_index.
        inames = {}
        for i, col in enumerate(final_names):
            inames[col] = i
        for df in frames:
            _dt = df.internal
            if df.nrows == 0: continue
            if n == 0:
                n = df.ncols
                final_names = list(df.names)
                for i, col in enumerate(df.names):
                    inames[col] = i
            elif not (df.ncols == n or force):
                raise TValueError(
                    "Cannot rbind frame with %s to a frame with %s. If"
                    " you wish to rbind the frames anyways, filling missing "
                    "values with NAs, then use `force=True`"
                    % (plural(df.ncols, "column"), plural(n, "column")))
            if final_names == list(df.names):
                spec.append((_dt, None))
                continue
            # Column mapping that specifies which column of `df` should be
            # appended where in the result.
            res = [None] * len(final_names)
            for i, col in enumerate(df.names):
                icol = inames.get(col)
                if icol is not None:
                    res[icol] = i
                elif force:
                    final_names.append(col)
                    inames[col] = len(final_names) - 1
                    res.append(i)
                    n += 1
                else:
                    raise TValueError(
                        "Column `%s` is not found in the source frame. "
                        "If you want to rbind the frames anyways filling "
                        "missing values with NAs, then use `force=True`"
                        % col)
            spec.append((_dt, res))

    # Append by column numbers
    else:
        for df in frames:
            _dt = df.internal
            if df.nrows == 0: continue
            if n == 0:
                n = df.ncols
                final_names = list(df.names)
            if df.ncols != n:
                if not force:
                    raise TValueError(
                        "Cannot rbind frame with %s to a frame with %s. If you "
                        "wish to rbind the Frames anyways filling missing "
                        "values with NAs, then use option `force=True`"
                        % (plural(df.ncols, "column"), plural(n, "column")))
                elif df.ncols > n:
                    final_names += list(df.names[n:])
                    n = df.ncols
            spec.append((_dt, None))

    # Perform the append operation on C level
    _dt = self.internal
    _dt.rbind(len(final_names), spec)
    self.names = final_names
    return self
Пример #8
0
def _cbind(self, *frames, force=False, inplace=True):
    """
    Append columns of Frames `frames` to the current Frame.

    This is equivalent to `pandas.concat(axis=1)`: the Frames are combined
    by columns, i.e. cbinding a Frame of shape [n x m] to a Frame of
    shape [n x k] produces a Frame of shape [n x (m + k)].

    As a special case, if you cbind a single-row Frame, then that row will
    be replicated as many times as there are rows in the current Frame. This
    makes it easy to create constant columns, or to append reduction results
    (such as min/max/mean/etc) to the current Frame.

    If Frame(s) being appended have different number of rows (with the
    exception of Frames having 1 row), then the operation will fail by
    default. You can force cbinding these Frames anyways by providing option
    `force=True`: this will fill all "short" Frames with NAs. Thus there is
    a difference in how Frames with 1 row are treated compared to Frames
    with any other number of rows.

    Parameters
    ----------
    frames: sequence or list of Frames
        One or more Frame to append. They should have the same number of
        rows (unless option `force` is also used).

    force: boolean, default False
        If True, allows Frames to be appended even if they have unequal
        number of rows. The resulting Frame will have number of rows equal
        to the largest among all Frames. Those Frames which have less
        than the largest number of rows, will be padded with NAs (with the
        exception of Frames having just 1 row, which will be replicated
        instead of filling with NAs).

    inplace: boolean, default True [DEPRECATED]
        If True, then the data is appended to the current Frame in-place,
        causing it to be modified. If False, then a new Frame will be
        constructed and returned instead (and no existing Frames will be
        modified).

    Returns
    -------
    The current Frame, modified, if `inplace` is True; or a new Frame
    containing all Frames concatenated, if `inplace` is False.
    """
    datatables = []

    # Which Frame to operate upon. If not `inplace` then we will create
    # a blank Frame and merge everything to it.
    src = self
    if not inplace:
        src = dt.Frame()
        datatables.append(self.internal)

    # Check that all Frames have compatible number of rows, and compose the
    # list of _DataTables to be passed down into the C level.
    nrows = src.nrows or -1
    for df in frames:
        if df.ncols == 0: continue
        nn = df.nrows
        if nrows == -1:
            nrows = nn
        if not(nn == nrows or nn == 1 or force):
            if nrows <= 1:
                nrows = nn
            else:
                raise TValueError(
                    "Cannot merge Frame with %s to a Frame with %s. If "
                    "you want to disregard this warning and merge Frames "
                    "anyways, then use option `force=True`"
                    % (plural(nn, "row"), plural(nrows, "row")))
        datatables.append(df.internal)

    _dt = src.internal
    _dt.cbind(datatables)
    return src
Пример #9
0
 def __repr__(self):
     """Return a short description of the Frame: its row and column counts."""
     shape = (plural(self.nrows, "row"), plural(self.ncols, "col"))
     return "<Frame [%s x %s]>" % shape
Пример #10
0
def make_rowfilter(rows, ee, _nested=False) -> RFNode:
    """
    Create an :class:`RFNode` from the provided expression.

    This is a factory function that instantiates an appropriate subclass of
    :class:`RFNode`, depending on the provided argument `rows`.

    Parameters
    ----------
    rows:
        An expression that will be converted into one of the RFNodes. This can
        have a variety of different types, see `help(Frame.__call__)` for
        more information.

    ee: EvaluationEngine
        The evaluation context within which the expression should be computed.

    _nested: bool, default False
        Internal attribute, set when `make_rowfilter()` calls itself on the
        result of a `rows` function. It only affects the wording of the
        final "unexpected argument" error.

    Raises
    ------
    TValueError
        If `rows` refers to invalid row indices, or has an invalid shape,
        length or content.

    TTypeError
        If `rows` (or an item produced by it) has an unsupported type.
    """
    nrows = ee.dt.nrows
    if rows is Ellipsis or rows is None:
        return AllRFNode(ee)

    if rows is True or rows is False:
        # Note: True/False are integer objects in Python
        raise TTypeError("Boolean value cannot be used as a `rows` selector")

    # A single selector is processed as a list of one element
    if isinstance(rows, (int, slice, range)):
        rows = [rows]

    from_generator = False
    if isinstance(rows, types.GeneratorType):
        # If an iterator is given, materialize it first. Otherwise there
        # is no way to ensure that the produced indices are valid.
        rows = list(rows)
        from_generator = True

    if isinstance(rows, (list, tuple, set)):
        # Every selected run of rows is described by its first row (`bases`),
        # the number of rows (`counts`) and the stride (`steps`). As long as
        # only single rows were encountered, `counts` and `steps` stay empty.
        bases = []
        counts = []
        steps = []
        for i, elem in enumerate(rows):
            if isinstance(elem, int):
                if -nrows <= elem < nrows:
                    # `elem % nrows` forces the row number to become positive
                    bases.append(elem % nrows)
                else:
                    raise TValueError(
                        "Row `%d` is invalid for datatable with %s" %
                        (elem, plural(nrows, "row")))
            elif isinstance(elem, (range, slice)):
                if elem.step == 0:
                    raise TValueError("In %r step must not be 0" % elem)
                if not all(x is None or isinstance(x, int)
                           for x in (elem.start, elem.stop, elem.step)):
                    raise TValueError("%r is not integer-valued" % elem)
                if isinstance(elem, range):
                    res = normalize_range(elem, nrows)
                    if res is None:
                        raise TValueError(
                            "Invalid %r for a datatable with %s" %
                            (elem, plural(nrows, "row")))
                else:
                    res = normalize_slice(elem, nrows)
                start, count, step = res
                assert count >= 0
                if count == 0:
                    pass  # don't do anything
                elif count == 1:
                    bases.append(start)
                else:
                    # Back-fill the single rows seen so far, so that the
                    # three lists are aligned before the new run is added
                    if len(counts) < len(bases):
                        counts += [1] * (len(bases) - len(counts))
                        steps += [1] * (len(bases) - len(steps))
                    bases.append(start)
                    counts.append(count)
                    steps.append(step)
            else:
                if from_generator:
                    raise TValueError(
                        "Invalid row selector %r generated at position %d" %
                        (elem, i))
                else:
                    raise TValueError(
                        "Invalid row selector %r at element %d of the "
                        "`rows` list" % (elem, i))
        if not counts:
            # Only individual rows were selected
            if len(bases) == 1:
                if bases[0] == 0 and nrows == 1:
                    return AllRFNode(ee)
                return SliceRFNode(ee, bases[0], 1, 1)
            else:
                return ArrayRFNode(ee, bases)
        elif len(bases) == 1:
            if bases[0] == 0 and counts[0] == nrows and steps[0] == 1:
                return AllRFNode(ee)
            else:
                return SliceRFNode(ee, bases[0], counts[0], steps[0])
        else:
            return MultiSliceRFNode(ee, bases, counts, steps)

    if is_type(rows, NumpyArray_t):
        arr = rows
        if not (len(arr.shape) == 1
                or len(arr.shape) == 2 and min(arr.shape) == 1):
            raise TValueError(
                "Only a single-dimensional numpy.array is allowed"
                " as a `rows` argument, got %r" % arr)
        # Turn a row-vector of shape (1, k) into a column of shape (k, 1)
        if len(arr.shape) == 2 and arr.shape[1] > 1:
            arr = arr.T
        if not (str(arr.dtype) == "bool" or str(arr.dtype).startswith("int")):
            raise TValueError("Either a boolean or an integer numpy.array is "
                              "expected for `rows` argument, got %r" % arr)
        # At this point the array has shape either (k,) or (k, 1), so the
        # number of elements is `shape[0]`. (The last dimension cannot be
        # used: for a 2-dimensional array it is always 1 here.)
        arr_length = arr.shape[0]
        if str(arr.dtype) == "bool" and arr_length != nrows:
            raise TValueError("Cannot apply a boolean numpy array of length "
                              "%d to a datatable with %s" %
                              (arr_length, plural(nrows, "row")))
        # The array is converted into a Frame and handled by the next block
        rows = datatable.Frame(arr)
        assert rows.ncols == 1
        assert rows.ltypes[0] == ltype.bool or rows.ltypes[0] == ltype.int

    if is_type(rows, Frame_t):
        if rows.ncols != 1:
            raise TValueError("`rows` argument should be a single-column "
                              "datatable, got %r" % rows)
        col0type = rows.ltypes[0]
        if col0type == ltype.bool:
            if rows.nrows != nrows:
                s1rows = plural(rows.nrows, "row")
                s2rows = plural(nrows, "row")
                raise TValueError("`rows` datatable has %s, but applied to a "
                                  "datatable with %s" % (s1rows, s2rows))
            return BooleanColumnRFNode(ee, rows)
        elif col0type == ltype.int:
            return IntegerColumnRFNode(ee, rows)
        else:
            raise TTypeError("`rows` datatable should be either a boolean or "
                             "an integer column, however it has type %s" %
                             col0type)

    if isinstance(rows, types.FunctionType):
        # The function is applied to the module-level `f`, and its result is
        # then processed as if it were passed as `rows` directly
        return make_rowfilter(rows(f), ee, _nested=True)

    if isinstance(rows, BaseExpr):
        return FilterExprRFNode(ee, rows)

    if _nested:
        raise TTypeError("Unexpected result produced by the `rows` "
                         "function: %r" % (rows, ))
    else:
        raise TTypeError("Unexpected `rows` argument: %r" % (rows, ))
Пример #11
0
 def __repr__(self):
     """Return a short description of the Frame: its id and dimensions."""
     shape = (plural(self._nrows, "row"), plural(self._ncols, "col"))
     return "<Frame #%d (%s x %s)>" % ((self._id,) + shape)
Пример #12
0
    def _override_columns(self, colnames, coltypes):
        assert len(colnames) == len(coltypes)
        n = len(colnames)
        colspec = self._columns
        self._colnames = []

        if colspec is None:
            self._colnames = colnames
            return

        if isinstance(colspec, (slice, range)):
            if isinstance(colspec, slice):
                start, count, step = normalize_slice(colspec, n)
            else:
                t = normalize_range(colspec, n)
                if t is None:
                    raise TValueError("Invalid range iterator for a file with "
                                      "%d columns: %r" % (n, colspec))
                start, count, step = t
            if step <= 0:
                raise TValueError("Cannot use slice/range with negative step "
                                  "for column filter: %r" % colspec)
            for i in range(n):
                if (i - start) % step == 0 and i < start + count * step:
                    self._colnames.append(colnames[i])
                else:
                    coltypes[i] = 0
            return

        if isinstance(colspec, set):
            # Make a copy of the `colspec`, in order to check whether all the
            # columns requested by the user were found, and issue a warning
            # otherwise.
            colsfound = set(colspec)
            for i in range(n):
                if colnames[i] in colspec:
                    if colnames[i] in colsfound:
                        colsfound.remove(colnames[i])
                    self._colnames.append(colnames[i])
                else:
                    coltypes[i] = 0
            if colsfound:
                self.logger.warning(
                    "Column(s) %r not found in the input file" %
                    list(colsfound))
            return

        if isinstance(colspec, (list, tuple)):
            nn = len(colspec)
            if n != nn:
                raise TValueError("Input file contains %s, whereas `columns` "
                                  "parameter specifies only %s" %
                                  (plural(n, "column"), plural(nn, "column")))
            for i in range(n):
                entry = colspec[i]
                if entry is None:
                    coltypes[i] = 0
                elif isinstance(entry, str):
                    self._colnames.append(entry)
                elif isinstance(entry, stype):
                    self._colnames.append(colnames[i])
                    coltypes[i] = _coltypes.get(entry)
                elif isinstance(entry, tuple):
                    newname, newtype = entry
                    self._colnames.append(newname)
                    coltypes[i] = _coltypes.get(newtype)
                    if not coltypes[i]:
                        raise TValueError(
                            "Unknown type %r used as an override "
                            "for column %r" % (newtype, newname))
                else:
                    raise TTypeError(
                        "Entry `columns[%d]` has invalid type %r" %
                        (i, entry.__class__.__name__))
            return

        if isinstance(colspec, dict):
            for i in range(n):
                name = colnames[i]
                if name in colspec:
                    entry = colspec[name]
                else:
                    entry = colspec.get(..., ...)
                if entry is None:
                    coltypes[i] = 0
                elif entry is Ellipsis:
                    self._colnames.append(name)
                elif isinstance(entry, str):
                    self._colnames.append(entry)
                else:
                    assert isinstance(entry, tuple)
                    newname, newtype = entry
                    if newname is Ellipsis:
                        newname = name
                    self._colnames.append(newname)
                    coltypes[i] = _coltypes.get(newtype)
                    if not coltypes[i]:
                        raise TValueError(
                            "Unknown type %r used as an override "
                            "for column %r" % (newtype, newname))

        if callable(colspec) and hasattr(colspec, "__code__"):
            nargs = colspec.__code__.co_argcount

            if nargs == 1:
                for i in range(n):
                    ret = colspec(colnames[i])
                    if ret is None or ret is False:
                        coltypes[i] = 0
                    elif ret is True:
                        self._colnames.append(colnames[i])
                    elif isinstance(ret, str):
                        self._colnames.append(ret)
                    else:
                        raise TValueError("Function passed as the `columns` "
                                          "argument was expected to return a "
                                          "`Union[None, bool, str]` but "
                                          "instead returned value %r" %
                                          (ret, ))
                return

            if nargs == 2:
                for i in range(n):
                    ret = colspec(i, colnames[i])
                    if ret is None or ret is False:
                        coltypes[i] = 0
                    elif ret is True:
                        self._colnames.append(colnames[i])
                    elif isinstance(ret, str):
                        self._colnames.append(ret)
                    else:
                        raise TValueError("Function passed as the `columns` "
                                          "argument was expected to return a "
                                          "`Union[None, bool, str]` but "
                                          "instead returned value %r" %
                                          (ret, ))
                return

            if nargs == 3:
                for i in range(n):
                    typ = _coltypes_strs[coltypes[i]]
                    ret = colspec(i, colnames[i], typ)
                    if ret is None or ret is False:
                        coltypes[i] = 0
                    elif ret is True:
                        self._colnames.append(colnames[i])
                    elif isinstance(ret, str):
                        self._colnames.append(ret)
                    elif isinstance(ret, tuple) and len(ret) == 2:
                        newname, newtype = ret
                        self._colnames.append(newname)
                        coltypes[i] = _coltypes.get(newtype)
                    else:
                        raise TValueError("Function passed as the `columns` "
                                          "argument was expected to return a "
                                          "`Union[None, bool, str, Tuple[str, "
                                          "Union[str, type]]]` but "
                                          "instead returned value %r" % ret)
                return

            raise RuntimeError("Unknown colspec: %r"  # pragma: no cover
                               % colspec)