def process_column(col, df):
    """
    Helper function to verify the validity of a single column selector.

    Given frame `df` and a column description `col`, this function returns:
      * either the numeric index of the column
      * a numeric slice, as a triple (start, count, step)
      * or a `BaseExpr` object

    Parameters
    ----------
    col: int | str | slice | ColSelectorExpr | BaseExpr
        The column selector to validate.

    df:
        The frame against which the selector is resolved. Only its `ncols`
        attribute and `colindex()` method are used here.

    Raises
    ------
    TValueError
        If an integer index is out of range, or if a slice mixes numeric and
        string bounds, uses a stride together with column names, or contains
        non-integer components.

    TTypeError
        If `col` is of none of the supported kinds.
    """
    if isinstance(col, int):
        ncols = df.ncols
        if -ncols <= col < ncols:
            # Negative indices count from the end; `%` maps them into
            # the range [0, ncols).
            return col % ncols
        raise TValueError(
            "Column index `{col}` is invalid for a frame with {ncolumns}".
            format(col=col, ncolumns=plural(ncols, "column")))

    if isinstance(col, str):
        # This raises an exception if `col` cannot be found in the dataframe
        return df.colindex(col)

    if isinstance(col, slice):
        start = col.start
        stop = col.stop
        step = col.step
        if isinstance(start, str) or isinstance(stop, str):
            # Slice over column names: both ends are inclusive, and each end
            # must be either a name or omitted.
            col0 = None
            col1 = None
            if start is None:
                col0 = 0
            elif isinstance(start, str):
                col0 = df.colindex(start)
            if stop is None:
                col1 = df.ncols - 1
            elif isinstance(stop, str):
                col1 = df.colindex(stop)
            if col0 is None or col1 is None:
                # One of the bounds was neither None nor a string
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if step is not None:
                raise TValueError("Column name slices cannot use strides: %r"
                                  % col)
            # The direction of the slice follows the order of the two names
            return (col0, abs(col1 - col0) + 1, 1 if col1 >= col0 else -1)
        elif all(x is None or isinstance(x, int)
                 for x in (start, stop, step)):
            return normalize_slice(col, df.ncols)
        else:
            raise TValueError("%r is not integer-valued" % col)

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    # `col` is wrapped into a 1-tuple so that a tuple-valued selector is
    # reported via %r instead of being unpacked as formatting arguments.
    raise TTypeError("Unknown column selector: %r" % (col, ))
def _apply_columns_slice(colslice, colsdesc):
    """
    Select columns according to a slice or range specification.

    Parameters
    ----------
    colslice: slice | range
        The selection; anything that is not a `slice` is passed to
        `normalize_range()`.

    colsdesc: sequence
        Column descriptors; each element must expose a `.name` attribute.

    Returns
    -------
    A tuple `(colnames, coltypes)`: `colnames` lists the names of the selected
    columns in selection order, and `coltypes` has one entry per input column,
    `rtype.rauto.value` for the selected ones and `rtype.rdrop.value` for
    all others.

    Raises
    ------
    TValueError
        If the range cannot be normalized for this number of columns, or if
        the resulting step is not positive.
    """
    ncols = len(colsdesc)
    if isinstance(colslice, slice):
        first, howmany, stride = normalize_slice(colslice, ncols)
    else:
        triple = normalize_range(colslice, ncols)
        if triple is None:
            raise TValueError("Invalid range iterator for a file with "
                              "%d columns: %r" % (ncols, colslice))
        first, howmany, stride = triple
    if stride <= 0:
        raise TValueError("Cannot use slice/range with negative step "
                          "for column filter: %r" % colslice)

    # Every column starts out as dropped; the selected ones are switched
    # to auto-detection below.
    coltypes = [rtype.rdrop.value] * ncols
    colnames = []
    for idx in range(first, first + howmany * stride, stride):
        colnames.append(colsdesc[idx].name)
        coltypes[idx] = rtype.rauto.value
    return (colnames, coltypes)
def __getitem__(self, item):
    """
    Check `normalize_slice()` against native slicing of `self._src`.

    The slice `item` is applied to `self._src` directly, and separately the
    (start, count, step) triple returned by `utils_misc.normalize_slice()`
    is used to pick the elements one by one and join them with "".
    Returns True if both approaches yield equal results.

    NOTE(review): the "".join implies that `self._src` is a string (or a
    sequence of strings) -- confirm against the enclosing class.
    """
    assert isinstance(item, slice)
    source = self._src
    start, count, step = utils_misc.normalize_slice(item, len(source))
    expected = source[item]
    pieces = [source[start + k * step] for k in range(count)]
    return expected == "".join(pieces)
def make_rowfilter(rows, ee, _nested=False) -> RFNode:
    """
    Create an :class:`RFNode` from the provided expression.

    This is a factory function that instantiates an appropriate subclass of
    :class:`RFNode`, depending on the provided argument `rows`.

    Parameters
    ----------
    rows:
        An expression that will be converted into one of the RFNodes. This can
        have a variety of different types, see `help(Frame.__call__)` for
        more information.

    ee: EvaluationEngine
        The evaluation context within which the expression should be computed.

    _nested: bool, default False
        Internal attribute, used to avoid deep recursion when
        `make_rowfilter()` calls itself. When this attribute is False
        recursion is allowed, otherwise not.

    Raises
    ------
    TTypeError
        If `rows` is a boolean, has an unsupported type, or is a function
        whose result cannot be converted into a row filter (including a
        function that returns another function).

    TValueError
        If an index, slice, range, numpy array or single-column frame passed
        as `rows` is not valid for a datatable with `ee.dt.nrows` rows.
    """
    nrows = ee.dt.nrows
    if rows is Ellipsis or rows is None:
        return AllRFNode(ee)

    if rows is True or rows is False:
        # Note: True/False are integer objects in Python
        raise TTypeError("Boolean value cannot be used as a `rows` selector")

    if isinstance(rows, (int, slice, range)):
        rows = [rows]

    from_generator = False
    if isinstance(rows, types.GeneratorType):
        # If an iterator is given, materialize it first. Otherwise there
        # is no way to ensure that the produced indices are valid.
        rows = list(rows)
        from_generator = True

    if isinstance(rows, (list, tuple, set)):
        # Each selected element is described by a base index; slices with
        # more than one element additionally get a count and a step.
        bases = []
        counts = []
        steps = []
        for i, elem in enumerate(rows):
            if isinstance(elem, int):
                if -nrows <= elem < nrows:
                    # `elem % nrows` forces the row number to become positive
                    bases.append(elem % nrows)
                else:
                    raise TValueError(
                        "Row `%d` is invalid for datatable with %s"
                        % (elem, plural(nrows, "row")))
            elif isinstance(elem, (range, slice)):
                if elem.step == 0:
                    raise TValueError("In %r step must not be 0" % elem)
                if not all(x is None or isinstance(x, int)
                           for x in (elem.start, elem.stop, elem.step)):
                    raise TValueError("%r is not integer-valued" % elem)
                if isinstance(elem, range):
                    res = normalize_range(elem, nrows)
                    if res is None:
                        raise TValueError(
                            "Invalid %r for a datatable with %s"
                            % (elem, plural(nrows, "row")))
                else:
                    res = normalize_slice(elem, nrows)
                start, count, step = res
                assert count >= 0
                if count == 0:
                    pass  # don't do anything
                elif count == 1:
                    bases.append(start)
                else:
                    # `counts` / `steps` are filled lazily: entries for the
                    # plain indices seen so far are padded with 1s only once
                    # the first real slice shows up.
                    if len(counts) < len(bases):
                        counts += [1] * (len(bases) - len(counts))
                        steps += [1] * (len(bases) - len(steps))
                    bases.append(start)
                    counts.append(count)
                    steps.append(step)
            else:
                if from_generator:
                    raise TValueError(
                        "Invalid row selector %r generated at position %d"
                        % (elem, i))
                else:
                    raise TValueError(
                        "Invalid row selector %r at element %d of the "
                        "`rows` list" % (elem, i))
        if not counts:
            # Only individual row indices were given
            if len(bases) == 1:
                if bases[0] == 0 and nrows == 1:
                    return AllRFNode(ee)
                return SliceRFNode(ee, bases[0], 1, 1)
            else:
                return ArrayRFNode(ee, bases)
        elif len(bases) == 1:
            if bases[0] == 0 and counts[0] == nrows and steps[0] == 1:
                return AllRFNode(ee)
            else:
                return SliceRFNode(ee, bases[0], counts[0], steps[0])
        else:
            # NOTE(review): `counts` and `steps` can be shorter than `bases`
            # here when plain indices follow the last slice -- confirm that
            # MultiSliceRFNode tolerates that.
            return MultiSliceRFNode(ee, bases, counts, steps)

    if is_type(rows, NumpyArray_t):
        arr = rows
        if not (len(arr.shape) == 1 or
                len(arr.shape) == 2 and min(arr.shape) == 1):
            raise TValueError(
                "Only a single-dimensional numpy.array is allowed"
                " as a `rows` argument, got %r" % arr)
        if len(arr.shape) == 2 and arr.shape[1] > 1:
            arr = arr.T
        if not (str(arr.dtype) == "bool" or
                str(arr.dtype).startswith("int")):
            raise TValueError("Either a boolean or an integer numpy.array is "
                              "expected for `rows` argument, got %r" % arr)
        if str(arr.dtype) == "bool" and arr.shape[-1] != nrows:
            raise TValueError("Cannot apply a boolean numpy array of length "
                              "%d to a datatable with %s"
                              % (arr.shape[-1], plural(nrows, "row")))
        # Convert into a single-column Frame, which is handled just below
        rows = datatable.Frame(arr)
        assert rows.ncols == 1
        assert rows.ltypes[0] == ltype.bool or rows.ltypes[0] == ltype.int

    if is_type(rows, Frame_t):
        if rows.ncols != 1:
            raise TValueError("`rows` argument should be a single-column "
                              "datatable, got %r" % rows)
        col0type = rows.ltypes[0]
        if col0type == ltype.bool:
            if rows.nrows != nrows:
                s1rows = plural(rows.nrows, "row")
                s2rows = plural(nrows, "row")
                raise TValueError("`rows` datatable has %s, but applied to a "
                                  "datatable with %s" % (s1rows, s2rows))
            return BooleanColumnRFNode(ee, rows)
        elif col0type == ltype.int:
            return IntegerColumnRFNode(ee, rows)
        else:
            raise TTypeError("`rows` datatable should be either a boolean or "
                             "an integer column, however it has type %s"
                             % col0type)

    # A function is evaluated once and its result is converted recursively.
    # When we are already inside such a nested call the function is NOT
    # evaluated again: it falls through to the "unexpected result" error
    # below, so that a function returning a function (or itself) cannot
    # cause unbounded recursion.
    if isinstance(rows, types.FunctionType) and not _nested:
        return make_rowfilter(rows(f), ee, _nested=True)

    if isinstance(rows, BaseExpr):
        return FilterExprRFNode(ee, rows)

    if _nested:
        raise TTypeError("Unexpected result produced by the `rows` "
                         "function: %r" % (rows, ))
    else:
        raise TTypeError("Unexpected `rows` argument: %r" % (rows, ))
def _override_columns(self, colnames, coltypes): assert len(colnames) == len(coltypes) n = len(colnames) colspec = self._columns self._colnames = [] if colspec is None: self._colnames = colnames return if isinstance(colspec, (slice, range)): if isinstance(colspec, slice): start, count, step = normalize_slice(colspec, n) else: t = normalize_range(colspec, n) if t is None: raise TValueError("Invalid range iterator for a file with " "%d columns: %r" % (n, colspec)) start, count, step = t if step <= 0: raise TValueError("Cannot use slice/range with negative step " "for column filter: %r" % colspec) for i in range(n): if (i - start) % step == 0 and i < start + count * step: self._colnames.append(colnames[i]) else: coltypes[i] = 0 return if isinstance(colspec, set): # Make a copy of the `colspec`, in order to check whether all the # columns requested by the user were found, and issue a warning # otherwise. colsfound = set(colspec) for i in range(n): if colnames[i] in colspec: if colnames[i] in colsfound: colsfound.remove(colnames[i]) self._colnames.append(colnames[i]) else: coltypes[i] = 0 if colsfound: self.logger.warning( "Column(s) %r not found in the input file" % list(colsfound)) return if isinstance(colspec, (list, tuple)): nn = len(colspec) if n != nn: raise TValueError("Input file contains %s, whereas `columns` " "parameter specifies only %s" % (plural(n, "column"), plural(nn, "column"))) for i in range(n): entry = colspec[i] if entry is None: coltypes[i] = 0 elif isinstance(entry, str): self._colnames.append(entry) elif isinstance(entry, stype): self._colnames.append(colnames[i]) coltypes[i] = _coltypes.get(entry) elif isinstance(entry, tuple): newname, newtype = entry self._colnames.append(newname) coltypes[i] = _coltypes.get(newtype) if not coltypes[i]: raise TValueError( "Unknown type %r used as an override " "for column %r" % (newtype, newname)) else: raise TTypeError( "Entry `columns[%d]` has invalid type %r" % (i, entry.__class__.__name__)) return if 
isinstance(colspec, dict): for i in range(n): name = colnames[i] if name in colspec: entry = colspec[name] else: entry = colspec.get(..., ...) if entry is None: coltypes[i] = 0 elif entry is Ellipsis: self._colnames.append(name) elif isinstance(entry, str): self._colnames.append(entry) else: assert isinstance(entry, tuple) newname, newtype = entry if newname is Ellipsis: newname = name self._colnames.append(newname) coltypes[i] = _coltypes.get(newtype) if not coltypes[i]: raise TValueError( "Unknown type %r used as an override " "for column %r" % (newtype, newname)) if callable(colspec) and hasattr(colspec, "__code__"): nargs = colspec.__code__.co_argcount if nargs == 1: for i in range(n): ret = colspec(colnames[i]) if ret is None or ret is False: coltypes[i] = 0 elif ret is True: self._colnames.append(colnames[i]) elif isinstance(ret, str): self._colnames.append(ret) else: raise TValueError("Function passed as the `columns` " "argument was expected to return a " "`Union[None, bool, str]` but " "instead returned value %r" % (ret, )) return if nargs == 2: for i in range(n): ret = colspec(i, colnames[i]) if ret is None or ret is False: coltypes[i] = 0 elif ret is True: self._colnames.append(colnames[i]) elif isinstance(ret, str): self._colnames.append(ret) else: raise TValueError("Function passed as the `columns` " "argument was expected to return a " "`Union[None, bool, str]` but " "instead returned value %r" % (ret, )) return if nargs == 3: for i in range(n): typ = _coltypes_strs[coltypes[i]] ret = colspec(i, colnames[i], typ) if ret is None or ret is False: coltypes[i] = 0 elif ret is True: self._colnames.append(colnames[i]) elif isinstance(ret, str): self._colnames.append(ret) elif isinstance(ret, tuple) and len(ret) == 2: newname, newtype = ret self._colnames.append(newname) coltypes[i] = _coltypes.get(newtype) else: raise TValueError("Function passed as the `columns` " "argument was expected to return a " "`Union[None, bool, str, Tuple[str, " "Union[str, 
type]]]` but " "instead returned value %r" % ret) return raise RuntimeError("Unknown colspec: %r" # pragma: no cover % colspec)