def _apply_columns_list(self, collist, colsdesc):
    """
    Apply a per-column override list to the columns found in the input.

    The entry at position ``i`` of ``collist`` controls the column described
    by ``colsdesc[i]``:

    * ``None`` / ``False`` -- the column is dropped;
    * ``True`` / ``...`` -- the column is kept under its detected name;
    * a string -- the column is kept and renamed to that string;
    * an ``stype``, ``ltype`` or python type -- the column keeps its
      detected name and gets the rtype that ``_rtypes_map`` holds for it;
    * a ``(name, type)`` tuple -- both the name and the type are overridden.

    The names of all retained columns are stored in ``self._colnames``.

    :param collist: sequence of override entries, one per input column.
    :param colsdesc: sequence of column descriptors; only their ``.name``
        attribute is read here.
    :returns: list with one rtype value per input column.
    :raises TValueError: if the two sequences differ in length, or if a
        tuple entry requests a type that is not in ``_rtypes_map``.
    :raises TTypeError: if an entry is of an unsupported kind.
    """
    ncols = len(colsdesc)
    nentries = len(collist)
    if ncols != nentries:
        raise TValueError("Input contains %s, whereas `columns` "
                          "parameter specifies only %s"
                          % (plural(ncols, "column"),
                             plural(nentries, "column")))

    kept_names = []
    # Every column starts out as "dropped"; retained ones are overwritten.
    coltypes = [rtype.rdrop.value] * ncols
    for i, entry in enumerate(collist):
        if entry is None or entry is False:
            continue
        if entry is True or entry is Ellipsis:
            name = colsdesc[i].name
            chosen = rtype.rauto
        elif isinstance(entry, str):
            name = entry
            chosen = rtype.rauto
        elif isinstance(entry, (stype, ltype, type)):
            name = colsdesc[i].name
            chosen = _rtypes_map[entry]
        elif isinstance(entry, tuple):
            name, override = entry
            if override not in _rtypes_map:
                raise TValueError("Unknown type %r used as an override "
                                  "for column %r" % (override, name))
            chosen = _rtypes_map[override]
        else:
            raise TTypeError("Entry `columns[%d]` has invalid type %r"
                             % (i, entry.__class__.__name__))
        kept_names.append(name)
        coltypes[i] = chosen.value

    self._colnames = kept_names
    return coltypes
def colindex(self, name):
    """
    Return the index of the column ``name``.

    :param name: name of the column to look up. A numeric index is accepted
        too: it is bounds-checked against the number of columns, and a
        negative index is converted into the equivalent non-negative one.
    :returns: the non-negative index of the column.
    :raises ValueError: if the requested column does not exist.
    """
    if not isinstance(name, str):
        ncols = self._ncols
        if 0 <= name < ncols:
            return name
        if -ncols <= name < 0:
            # Negative indices count from the end.
            return name + ncols
        raise TValueError("Column index `%d` is invalid for a "
                          "datatable with %s"
                          % (name, plural(ncols, "column")))

    if name not in self._inames:
        raise TValueError("Column `%s` does not exist in %r" % (name, self))
    return self._inames[name]
def process_column(col, df):
    """
    Helper function to verify the validity of a single column selector.

    Given frame `df` and a column description `col`, this function returns:
      * either the numeric index of the column
      * a numeric slice, as a triple (start, count, step)
      * or a `BaseExpr` object

    :param col: an integer index (negative values count from the end), a
        column name, a slice of integers or of column names, a
        `ColSelectorExpr`, or any other `BaseExpr`.
    :param df: the frame the selector is applied to; its `ncols` attribute
        and `colindex()` method are used.
    :raises TValueError: if an integer index is out of range, or a slice
        mixes names with numbers, uses a stride together with names, or
        contains non-integer values.
    :raises TTypeError: if `col` is of an unsupported type.
    """
    if isinstance(col, int):
        ncols = df.ncols
        if -ncols <= col < ncols:
            # The modulo maps a negative index onto its positive equivalent.
            return col % ncols
        raise TValueError(
            "Column index `{col}` is invalid for a frame with {ncolumns}".
            format(col=col, ncolumns=plural(ncols, "column")))

    if isinstance(col, str):
        # This raises an exception if `col` cannot be found in the dataframe
        return df.colindex(col)

    if isinstance(col, slice):
        start = col.start
        stop = col.stop
        step = col.step
        if isinstance(start, str) or isinstance(stop, str):
            col0 = None
            col1 = None
            if start is None:
                col0 = 0
            elif isinstance(start, str):
                col0 = df.colindex(start)
            if stop is None:
                col1 = df.ncols - 1
            elif isinstance(stop, str):
                col1 = df.colindex(stop)
            # An endpoint still unresolved here was neither None nor a
            # string, i.e. names and numbers were mixed in one slice.
            if col0 is None or col1 is None:
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % (col,))
            if step is not None:
                raise TValueError(
                    "Column name slices cannot use strides: %r" % (col,))
            # Name slices include both endpoints and may run backwards.
            return (col0, abs(col1 - col0) + 1, 1 if col1 >= col0 else -1)
        elif all(x is None or isinstance(x, int)
                 for x in (start, stop, step)):
            return normalize_slice(col, df.ncols)
        else:
            raise TValueError("%r is not integer-valued" % (col,))

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    # `col` may itself be a tuple, so it must be wrapped before being handed
    # to the `%` operator; otherwise the message is built incorrectly (or
    # formatting fails) instead of reporting the offending selector.
    raise TTypeError("Unknown column selector: %r" % (col,))
def sort_columns(self, frame):
    """
    Sort `frame` by a random selection of its columns.

    The number of columns is drawn from an exponential distribution (at
    least one, at most all of them) and the columns themselves are sampled
    without repetition. The action is printed, mirrored into
    `python_output` when that is set, and then applied through
    `frame.sort_columns()`. Nothing is done for a frame without columns.
    """
    total = frame.ncols
    if total == 0:
        return

    how_many = min(int(random.expovariate(1.0)) + 1, total)
    chosen = random.sample(range(total), how_many)
    description = plural(len(chosen), "column")
    print("[10] Sorting %s in ascending order: %r" % (description, chosen))
    if python_output:
        python_output.write("DT = DT.sort(%r)\n" % chosen)
    frame.sort_columns(chosen)
def set_key_columns(self, frame):
    """
    Try to set a random selection of `frame`'s columns as its key.

    The number of key columns is drawn from an exponential distribution (at
    least one, at most all of them) and the columns are sampled without
    repetition. The attempt is printed and forwarded to
    `frame.set_key_columns()`. When `python_output` is set, either the
    plain key assignment or, if the call reported failure, a
    `pytest.raises` block expecting the "values are not unique" error is
    written to it. Nothing is done for a frame without columns.
    """
    total = frame.ncols
    if total == 0:
        return

    nkeys = min(int(random.expovariate(1.0)) + 1, total)
    keys = random.sample(range(total), nkeys)
    names = [frame.names[k] for k in keys]
    print("[13] Setting %s: %r" % (plural(nkeys, "key column"), keys))
    succeeded = frame.set_key_columns(keys, names)

    if not python_output:
        return
    if succeeded:
        python_output.write("DT.key = %r\n" % names)
        return
    python_output.write("with pytest.raises(ValueError, "
                        "match='Cannot set a key: the values are "
                        "not unique'):\n"
                        " DT.key = %r\n\n" % names)
def rename(self, columns: Union[Dict[str, str], Dict[int, str], List[str], Tuple[str, ...]]):
    """
    Rename columns of the datatable.

    :param columns: either a list/tuple holding the complete set of new
        names (its length must equal the number of columns), or a
        dictionary of ``{old: new_name}`` entries, where ``old`` is any
        column reference understood by :meth:`colindex`.
    :returns: None
    :raises TValueError: if a list/tuple with the wrong number of names is
        given.
    """
    if not isinstance(columns, (list, tuple)):
        # Mapping form: start from the current names and patch them.
        names = list(self._names)
        for old, new in columns.items():
            names[self.colindex(old)] = new
    elif len(columns) == self._ncols:
        names = columns
    else:
        raise TValueError("Cannot rename columns to %r: expected %s"
                          % (columns, plural(self._ncols, "name")))
    self._fill_from_dt(self._dt, names=names)
def _rbind(self, *frames, force=False, bynames=True): """ Append rows of `frames` to the current Frame. This is equivalent to `list.extend()` in Python: the Frames are combined by rows, i.e. rbinding a Frame of shape [n x k] to a Frame of shape [m x k] produces a Frame of shape [(m + n) x k]. This method modifies the current Frame in-place. If you do not want the current Frame modified, then append all Frames to an empty Frame: `dt.Frame().rbind(frame1, frame2)`. If Frame(s) being appended have columns of types different from the current Frame, then these columns will be promoted to the largest of two types: bool -> int -> float -> string. If you need to append multiple Frames, then it is more efficient to collect them into an array first and then do a single `rbind()`, than it is to append them one-by-one. Appending data to a Frame opened from disk will force loading the current Frame into memory, which may fail with an OutOfMemory exception. Parameters ---------- frames: sequence or list of Frames One or more Frame to append. These Frames should have the same columnar structure as the current Frame (unless option `force` is used). force: boolean, default False If True, then the Frames are allowed to have mismatching set of columns. Any gaps in the data will be filled with NAs. bynames: boolean, default True If True, the columns in Frames are matched by their names. For example, if one Frame has columns ["colA", "colB", "colC"] and the other ["colB", "colA", "colC"] then we will swap the order of the first two columns of the appended Frame before performing the append. However if `bynames` is False, then the column names will be ignored, and the columns will be matched according to their order, i.e. i-th column in the current Frame to the i-th column in each appended Frame. 
""" n = self.ncols # `spec` will be the description of how the DataTables are to be merged: # it is a list of tuples (core.DataTable, Optional[List[int]]), where the # first item in the tuple is a Frame being appended, and the second item # is the array of column indices within that Frame. For example, if the # array is [1, 0, None, 2, None] then it means that we need to take the # Frame being appended, reorder its columns as (2nd column, 1st column, # column of NAs, 3rd column, column of NAs) and only then "stitch" to the # resulting Frame of 5 columns. spec = [] final_names = list(self.names) # Append by column names, filling with NAs as necessary if bynames: # `inames` is a mapping of column_name => column_index. inames = {} for i, col in enumerate(final_names): inames[col] = i for df in frames: _dt = df.internal if df.nrows == 0: continue if n == 0: n = df.ncols final_names = list(df.names) for i, col in enumerate(df.names): inames[col] = i elif not (df.ncols == n or force): raise TValueError( "Cannot rbind frame with %s to a frame with %s. If" " you wish to rbind the frames anyways, filling missing " "values with NAs, then use `force=True`" % (plural(df.ncols, "column"), plural(n, "column"))) if final_names == list(df.names): spec.append((_dt, None)) continue # Column mapping that specifies which column of `df` should be # appended where in the result. res = [None] * len(final_names) for i, col in enumerate(df.names): icol = inames.get(col) if icol is not None: res[icol] = i elif force: final_names.append(col) inames[col] = len(final_names) - 1 res.append(i) n += 1 else: raise TValueError( "Column `%s` is not found in the source frame. 
" "If you want to rbind the frames anyways filling " "missing values with NAs, then use `force=True`" % col) spec.append((_dt, res)) # Append by column numbers else: for df in frames: _dt = df.internal if df.nrows == 0: continue if n == 0: n = df.ncols final_names = list(df.names) if df.ncols != n: if not force: raise TValueError( "Cannot rbind frame with %s to a frame with %s. If you " "wish to rbind the Frames anyways filling missing " "values with NAs, then use option `force=True`" % (plural(df.ncols, "column"), plural(n, "column"))) elif df.ncols > n: final_names += list(df.names[n:]) n = df.ncols spec.append((_dt, None)) # Perform the append operation on C level _dt = self.internal _dt.rbind(len(final_names), spec) self.names = final_names return self
def _cbind(self, *frames, force=False, inplace=True): """ Append columns of Frames `frames` to the current Frame. This is equivalent to `pandas.concat(axis=1)`: the Frames are combined by columns, i.e. cbinding a Frame of shape [n x m] to a Frame of shape [n x k] produces a Frame of shape [n x (m + k)]. As a special case, if you cbind a single-row Frame, then that row will be replicated as many times as there are rows in the current Frame. This makes it easy to create constant columns, or to append reduction results (such as min/max/mean/etc) to the current Frame. If Frame(s) being appended have different number of rows (with the exception of Frames having 1 row), then the operation will fail by default. You can force cbinding these Frames anyways by providing option `force=True`: this will fill all "short" Frames with NAs. Thus there is a difference in how Frames with 1 row are treated compared to Frames with any other number of rows. Parameters ---------- frames: sequence or list of Frames One or more Frame to append. They should have the same number of rows (unless option `force` is also used). force: boolean, default False If True, allows Frames to be appended even if they have unequal number of rows. The resulting Frame will have number of rows equal to the largest among all Frames. Those Frames which have less than the largest number of rows, will be padded with NAs (with the exception of Frames having just 1 row, which will be replicated instead of filling with NAs). inplace: boolean, default True [DEPRECATED] If True, then the data is appended to the current Frame in-place, causing it to be modified. If False, then a new Frame will be constructed and returned instead (and no existing Frames will be modified). Returns ------- The current Frame, modified, if `inplace` is True; or a new Frame containing all Frames concatenated, if `inplace` is False. """ datatables = [] # Which Frame to operate upon. 
If not `inplace` then we will create # a blank Frame and merge everything to it. src = self if not inplace: src = dt.Frame() datatables.append(self.internal) # Check that all Frames have compatible number of rows, and compose the # list of _DataTables to be passed down into the C level. nrows = src.nrows or -1 for df in frames: if df.ncols == 0: continue nn = df.nrows if nrows == -1: nrows = nn if not(nn == nrows or nn == 1 or force): if nrows <= 1: nrows = nn else: raise TValueError( "Cannot merge Frame with %s to a Frame with %s. If " "you want to disregard this warning and merge Frames " "anyways, then use option `force=True`" % (plural(nn, "row"), plural(nrows, "row"))) datatables.append(df.internal) _dt = src.internal _dt.cbind(datatables) return src
def __repr__(self):
    """Return a short description of the Frame built from its row and
    column counts."""
    dims = (plural(self.nrows, "row"), plural(self.ncols, "col"))
    return "<Frame [%s x %s]>" % dims
def make_rowfilter(rows, ee, _nested=False) -> RFNode:
    """
    Create an :class:`RFNode` from the provided expression.

    This is a factory function that instantiates an appropriate subclass of
    :class:`RFNode`, depending on the provided argument `rows`.

    Parameters
    ----------
    rows:
        An expression that will be converted into one of the RFNodes. This
        can have a variety of different types, see `help(Frame.__call__)`
        for more information. The kinds handled here are: ``...`` / None;
        an integer, slice or range; a list, tuple, set or generator of
        those; a numpy array; a single-column Frame; a plain function; and
        a `BaseExpr`.

    ee: EvaluationEngine
        The evaluation context within which the expression should be
        computed. The number of rows is taken from `ee.dt.nrows`.

    _nested: bool, default False
        Internal attribute, set to True when `make_rowfilter()` calls
        itself with the result of a user-supplied function. In this code it
        only selects which error message is used for an unsupported value.

    Raises
    ------
    TTypeError
        If `rows` is a boolean, a Frame column of unsupported type, or a
        value of an unsupported kind.
    TValueError
        If an index, slice, range, numpy array or Frame is not valid for a
        datatable with `ee.dt.nrows` rows.
    """
    nrows = ee.dt.nrows
    if rows is Ellipsis or rows is None:
        return AllRFNode(ee)

    if rows is True or rows is False:
        # Note: True/False are integer objects in Python
        raise TTypeError("Boolean value cannot be used as a `rows` selector")

    # A single selector is handled as a one-element list.
    if isinstance(rows, (int, slice, range)):
        rows = [rows]

    from_generator = False
    if isinstance(rows, types.GeneratorType):
        # If an iterator is given, materialize it first. Otherwise there
        # is no way to ensure that the produced indices are valid.
        rows = list(rows)
        from_generator = True

    if isinstance(rows, (list, tuple, set)):
        # The selection is accumulated as (base, count, step) triples kept
        # in three parallel lists. `counts` and `steps` are filled lazily:
        # they stay empty while only single rows have been seen, and they
        # are padded with 1s only when a multi-row slice is appended, so
        # they may end up shorter than `bases` if single rows come last.
        bases = []
        counts = []
        steps = []
        for i, elem in enumerate(rows):
            if isinstance(elem, int):
                if -nrows <= elem < nrows:
                    # `elem % nrows` forces the row number to become positive
                    bases.append(elem % nrows)
                else:
                    raise TValueError(
                        "Row `%d` is invalid for datatable with %s"
                        % (elem, plural(nrows, "row")))
            elif isinstance(elem, (range, slice)):
                if elem.step == 0:
                    raise TValueError("In %r step must not be 0" % elem)
                if not all(x is None or isinstance(x, int)
                           for x in (elem.start, elem.stop, elem.step)):
                    raise TValueError("%r is not integer-valued" % elem)
                if isinstance(elem, range):
                    res = normalize_range(elem, nrows)
                    if res is None:
                        raise TValueError(
                            "Invalid %r for a datatable with %s"
                            % (elem, plural(nrows, "row")))
                else:
                    res = normalize_slice(elem, nrows)
                start, count, step = res
                assert count >= 0
                if count == 0:
                    pass  # don't do anything
                elif count == 1:
                    bases.append(start)
                else:
                    # Bring `counts`/`steps` up to the length of `bases`
                    # (single rows recorded earlier get count 1, step 1)
                    # before adding the new triple.
                    if len(counts) < len(bases):
                        counts += [1] * (len(bases) - len(counts))
                        steps += [1] * (len(bases) - len(steps))
                    bases.append(start)
                    counts.append(count)
                    steps.append(step)
            else:
                if from_generator:
                    raise TValueError(
                        "Invalid row selector %r generated at position %d"
                        % (elem, i))
                else:
                    raise TValueError(
                        "Invalid row selector %r at element %d of the "
                        "`rows` list" % (elem, i))
        # Pick the most specific node for what was collected.
        if not counts:
            # Only individual rows were requested.
            if len(bases) == 1:
                if bases[0] == 0 and nrows == 1:
                    return AllRFNode(ee)
                return SliceRFNode(ee, bases[0], 1, 1)
            else:
                return ArrayRFNode(ee, bases)
        elif len(bases) == 1:
            # Exactly one multi-row slice.
            if bases[0] == 0 and counts[0] == nrows and steps[0] == 1:
                return AllRFNode(ee)
            else:
                return SliceRFNode(ee, bases[0], counts[0], steps[0])
        else:
            return MultiSliceRFNode(ee, bases, counts, steps)

    if is_type(rows, NumpyArray_t):
        arr = rows
        # Accept 1-D arrays, and 2-D arrays in which one dimension is 1.
        if not (len(arr.shape) == 1 or
                len(arr.shape) == 2 and min(arr.shape) == 1):
            raise TValueError(
                "Only a single-dimensional numpy.array is allowed"
                " as a `rows` argument, got %r" % arr)
        if len(arr.shape) == 2 and arr.shape[1] > 1:
            arr = arr.T
        if not (str(arr.dtype) == "bool" or
                str(arr.dtype).startswith("int")):
            raise TValueError("Either a boolean or an integer numpy.array is "
                              "expected for `rows` argument, got %r" % arr)
        if str(arr.dtype) == "bool" and arr.shape[-1] != nrows:
            raise TValueError("Cannot apply a boolean numpy array of length "
                              "%d to a datatable with %s"
                              % (arr.shape[-1], plural(nrows, "row")))
        # The array is converted into a Frame and then handled by the Frame
        # branch below.
        rows = datatable.Frame(arr)
        assert rows.ncols == 1
        assert rows.ltypes[0] == ltype.bool or rows.ltypes[0] == ltype.int

    if is_type(rows, Frame_t):
        if rows.ncols != 1:
            raise TValueError("`rows` argument should be a single-column "
                              "datatable, got %r" % rows)
        col0type = rows.ltypes[0]
        if col0type == ltype.bool:
            # A boolean mask must have one value per row.
            if rows.nrows != nrows:
                s1rows = plural(rows.nrows, "row")
                s2rows = plural(nrows, "row")
                raise TValueError("`rows` datatable has %s, but applied to a "
                                  "datatable with %s" % (s1rows, s2rows))
            return BooleanColumnRFNode(ee, rows)
        elif col0type == ltype.int:
            return IntegerColumnRFNode(ee, rows)
        else:
            raise TTypeError("`rows` datatable should be either a boolean or "
                             "an integer column, however it has type %s"
                             % col0type)

    if isinstance(rows, types.FunctionType):
        # The function is called with the module-level `f` object and its
        # result is processed again.
        # NOTE(review): `_nested` is not checked here, so a function that
        # returns another function is called again rather than rejected;
        # confirm whether recursion was meant to be limited to one level.
        return make_rowfilter(rows(f), ee, _nested=True)

    if isinstance(rows, BaseExpr):
        return FilterExprRFNode(ee, rows)

    if _nested:
        raise TTypeError("Unexpected result produced by the `rows` "
                         "function: %r" % (rows, ))
    else:
        raise TTypeError("Unexpected `rows` argument: %r" % (rows, ))
def __repr__(self):
    """Return a short description of the Frame: its id followed by its row
    and column counts."""
    dims = (plural(self._nrows, "row"), plural(self._ncols, "col"))
    return "<Frame #%d (%s x %s)>" % ((self._id,) + dims)
def _override_columns(self, colnames, coltypes): assert len(colnames) == len(coltypes) n = len(colnames) colspec = self._columns self._colnames = [] if colspec is None: self._colnames = colnames return if isinstance(colspec, (slice, range)): if isinstance(colspec, slice): start, count, step = normalize_slice(colspec, n) else: t = normalize_range(colspec, n) if t is None: raise TValueError("Invalid range iterator for a file with " "%d columns: %r" % (n, colspec)) start, count, step = t if step <= 0: raise TValueError("Cannot use slice/range with negative step " "for column filter: %r" % colspec) for i in range(n): if (i - start) % step == 0 and i < start + count * step: self._colnames.append(colnames[i]) else: coltypes[i] = 0 return if isinstance(colspec, set): # Make a copy of the `colspec`, in order to check whether all the # columns requested by the user were found, and issue a warning # otherwise. colsfound = set(colspec) for i in range(n): if colnames[i] in colspec: if colnames[i] in colsfound: colsfound.remove(colnames[i]) self._colnames.append(colnames[i]) else: coltypes[i] = 0 if colsfound: self.logger.warning( "Column(s) %r not found in the input file" % list(colsfound)) return if isinstance(colspec, (list, tuple)): nn = len(colspec) if n != nn: raise TValueError("Input file contains %s, whereas `columns` " "parameter specifies only %s" % (plural(n, "column"), plural(nn, "column"))) for i in range(n): entry = colspec[i] if entry is None: coltypes[i] = 0 elif isinstance(entry, str): self._colnames.append(entry) elif isinstance(entry, stype): self._colnames.append(colnames[i]) coltypes[i] = _coltypes.get(entry) elif isinstance(entry, tuple): newname, newtype = entry self._colnames.append(newname) coltypes[i] = _coltypes.get(newtype) if not coltypes[i]: raise TValueError( "Unknown type %r used as an override " "for column %r" % (newtype, newname)) else: raise TTypeError( "Entry `columns[%d]` has invalid type %r" % (i, entry.__class__.__name__)) return if 
isinstance(colspec, dict): for i in range(n): name = colnames[i] if name in colspec: entry = colspec[name] else: entry = colspec.get(..., ...) if entry is None: coltypes[i] = 0 elif entry is Ellipsis: self._colnames.append(name) elif isinstance(entry, str): self._colnames.append(entry) else: assert isinstance(entry, tuple) newname, newtype = entry if newname is Ellipsis: newname = name self._colnames.append(newname) coltypes[i] = _coltypes.get(newtype) if not coltypes[i]: raise TValueError( "Unknown type %r used as an override " "for column %r" % (newtype, newname)) if callable(colspec) and hasattr(colspec, "__code__"): nargs = colspec.__code__.co_argcount if nargs == 1: for i in range(n): ret = colspec(colnames[i]) if ret is None or ret is False: coltypes[i] = 0 elif ret is True: self._colnames.append(colnames[i]) elif isinstance(ret, str): self._colnames.append(ret) else: raise TValueError("Function passed as the `columns` " "argument was expected to return a " "`Union[None, bool, str]` but " "instead returned value %r" % (ret, )) return if nargs == 2: for i in range(n): ret = colspec(i, colnames[i]) if ret is None or ret is False: coltypes[i] = 0 elif ret is True: self._colnames.append(colnames[i]) elif isinstance(ret, str): self._colnames.append(ret) else: raise TValueError("Function passed as the `columns` " "argument was expected to return a " "`Union[None, bool, str]` but " "instead returned value %r" % (ret, )) return if nargs == 3: for i in range(n): typ = _coltypes_strs[coltypes[i]] ret = colspec(i, colnames[i], typ) if ret is None or ret is False: coltypes[i] = 0 elif ret is True: self._colnames.append(colnames[i]) elif isinstance(ret, str): self._colnames.append(ret) elif isinstance(ret, tuple) and len(ret) == 2: newname, newtype = ret self._colnames.append(newname) coltypes[i] = _coltypes.get(newtype) else: raise TValueError("Function passed as the `columns` " "argument was expected to return a " "`Union[None, bool, str, Tuple[str, " "Union[str, 
type]]]` but " "instead returned value %r" % ret) return raise RuntimeError("Unknown colspec: %r" # pragma: no cover % colspec)