Exemplo n.º 1
0
 def _fill_from_source(self, src, names, stypes):
     """
     Populate this object from `src`, dispatching on the type of `src`.

     Parameters
     ----------
     src:
         The data source. Handled kinds, in the order they are tested: a
         list; a tuple, set or range; a dict; a ``core.DataTable``; a string
         (handed to ``datatable.fread``); ``None``; a Frame; a pandas
         DataFrame or Series; a numpy array; ``Ellipsis``.

     names:
         Column names forwarded to the type-specific filler. Ignored when
         `src` is a dict (its keys are used instead), ``None`` or
         ``Ellipsis``. For string and Frame sources a ``None`` value is
         replaced by the names of the source.

     stypes:
         Forwarded only when `src` is a list, tuple, set, range or dict.

     Raises
     ------
     TTypeError
         If `src` matches none of the supported kinds.
     """
     if isinstance(src, list):
         # An empty list is wrapped, so that `[[]]` is what gets passed on.
         columns = [src] if len(src) == 0 else src
         self._fill_from_list(columns, names=names, stypes=stypes)
         return

     if isinstance(src, (tuple, set, range)):
         # The whole iterable is materialized into one inner list.
         single = list(src)
         self._fill_from_list([single], names=names, stypes=stypes)
         return

     if isinstance(src, dict):
         # Values become the data, keys become the names.
         values = list(src.values())
         keys = tuple(src.keys())
         self._fill_from_list(values, names=keys, stypes=stypes)
         return

     if isinstance(src, core.DataTable):
         self._fill_from_dt(src, names=names)
         return

     if isinstance(src, str):
         loaded = datatable.fread(src)
         colnames = loaded.names if names is None else names
         self._fill_from_dt(loaded.internal, names=colnames)
         return

     if src is None:
         self._fill_from_list([], names=None, stypes=None)
         return

     if is_type(src, Frame_t):
         colnames = src.names if names is None else names
         sliced = core.columns_from_slice(src.internal, None, 0, src.ncols, 1)
         self._fill_from_dt(sliced.to_datatable(), names=colnames)
         return

     if is_type(src, PandasDataFrame_t, PandasSeries_t):
         self._fill_from_pandas(src, names)
         return

     if is_type(src, NumpyArray_t):
         self._fill_from_numpy(src, names=names)
         return

     if src is Ellipsis:
         self._fill_from_list([42], "?", None)
         return

     raise TTypeError("Cannot create Frame from %r" % src)
Exemplo n.º 2
0
    def _fill_from_numpy(self, arr, names):
        """
        Populate this object from a numpy array with at most 2 dimensions.

        A 0-D array is reshaped to 1x1 and a 1-D array to a single column.
        Every column ``arr[:, i]`` of the resulting 2-D array is then passed
        to ``core.datatable_from_list``. For a masked array the data and the
        mask are converted separately and the mask is applied to the data
        with ``apply_na_mask``.

        Parameters
        ----------
        arr:
            The source numpy array. Arrays in non-native byte order are
            converted to native order, and float16 arrays are converted to
            float32 before being passed on.

        names:
            Column names; when ``None`` a list of ``None`` with one entry
            per column is passed to ``_fill_from_dt`` instead.

        Raises
        ------
        TValueError
            If `arr` has more than 2 dimensions.
        """
        ndims = len(arr.shape)
        if ndims > 2:
            raise TValueError("Cannot create Frame from a %d-D numpy "
                              "array %r" % (ndims, arr))
        if ndims == 0:
            arr = arr.reshape((1, 1))
        elif ndims == 1:
            arr = arr.reshape((len(arr), 1))
        if not arr.dtype.isnative:
            # Swap the bytes, then reinterpret them with the opposite
            # byte-order dtype. `ndarray.newbyteorder()` was removed in
            # NumPy 2.0; viewing through `dtype.newbyteorder()` is the
            # equivalent that works on both NumPy 1.x and 2.x.
            arr = arr.byteswap().view(arr.dtype.newbyteorder())
        if str(arr.dtype) == "float16":
            arr = arr.astype("float32")

        ncols = arr.shape[1]
        if is_type(arr, NumpyMaskedArray_t):
            dt = core.datatable_from_list(
                [arr.data[:, i] for i in range(ncols)], None)
            mask = core.datatable_from_list(
                [arr.mask[:, i] for i in range(ncols)], None)
            dt.apply_na_mask(mask)
        else:
            dt = core.datatable_from_list([arr[:, i] for i in range(ncols)],
                                          None)

        if names is None:
            names = [None] * ncols
        self._fill_from_dt(dt, names=names)
Exemplo n.º 3
0
 def register_option(self, key, xtype, default, doc=None):
     """
     Register a new option under the name `key`.

     A dotted `key` is split at its first dot: the part before the dot
     selects (or creates) a nested ``DtConfig`` stored in ``self._keyvals``,
     and the registration of the remainder is delegated to that nested
     config. A `key` without a dot is registered directly on this object as
     a ``DtOption``.

     Parameters
     ----------
     key: str
         Name of the option, relative to this config's prefix.

     xtype:
         Expected type of the option's value. The default is checked
         against it with ``is_type`` unless `xtype` is the builtin
         ``callable``.

     default:
         Default value of the option.

     doc:
         Documentation passed on to the ``DtOption``.

     Raises
     ------
     TValueError
         If `key` starts with a dot, if the part before the dot is already
         registered as a plain option, if `key` is already registered, or
         if `default` fails the type check.
     """
     assert isinstance(key, str)
     group_name, dot, remainder = key.partition(".")

     if dot:
         if not group_name:
             # The key begins with a dot.
             raise TValueError("Invalid option name `%s`" % key)
         group = self._keyvals.get(group_name, None)
         if group is None:
             group = DtConfig(self._prefix + group_name)
             self._keyvals[group_name] = group
         if not isinstance(group, DtConfig):
             fullkey = self._prefix + key
             fullprekey = self._prefix + group_name
             raise TValueError("Cannot register option `%s` because `%s` "
                               "is already registered as an option"
                               % (fullkey, fullprekey))
         group.register_option(remainder, xtype, default, doc)
         return

     if key in self._keyvals:
         fullkey = self._prefix + key
         raise TValueError("Option `%s` already registered" % fullkey)

     type_matches = xtype is callable or is_type(default, xtype)
     if not type_matches:
         raise TValueError("Default value `%s` is not of type %s"
                           % (default, name_type(xtype)))

     self._keyvals[key] = DtOption(xtype=xtype, default=default, doc=doc,
                                   name=self._prefix + key)
Exemplo n.º 4
0
 def _fill_from_pandas(self, pddf, names=None):
     """
     Populate this object from a pandas DataFrame or Series.

     The ``.values`` array of every column (or of the Series itself) is
     collected, normalized, and passed to ``core.datatable_from_list``.
     Normalization converts arrays in non-native byte order to native
     order, and float16 arrays to float32.

     Parameters
     ----------
     pddf:
         A pandas DataFrame or Series.

     names:
         Column names. For a DataFrame, ``None`` is replaced by the
         stringified column labels; for a Series the value is passed
         through unchanged.

     Raises
     ------
     TTypeError
         If `pddf` is neither a pandas DataFrame nor a pandas Series.
     """
     if is_type(pddf, PandasDataFrame_t):
         if names is None:
             names = [str(c) for c in pddf.columns]
         colarrays = [pddf[c].values for c in pddf.columns]
     elif is_type(pddf, PandasSeries_t):
         colarrays = [pddf.values]
     else:
         raise TTypeError("Unexpected type of parameter %r" % pddf)

     for i, col in enumerate(colarrays):
         if not col.dtype.isnative:
             # Array has wrong endianness -- coerce into native byte-order.
             # `ndarray.newbyteorder()` was removed in NumPy 2.0; viewing the
             # swapped bytes through `dtype.newbyteorder()` is the equivalent
             # that works on both NumPy 1.x and 2.x.
             col = col.byteswap().view(col.dtype.newbyteorder())
             assert col.dtype.isnative
         if col.dtype.char == 'e' and str(col.dtype) == "float16":
             col = col.astype("float32")
         colarrays[i] = col

     dt = core.datatable_from_list(colarrays, None)
     self._fill_from_dt(dt, names=names)
Exemplo n.º 5
0
 def __setattr__(self, key, val):
     """
     Assign `val` to the option named `key`.

     The entry is looked up with ``self._get_opt(key)``; its ``value`` is
     updated only when the entry is a ``DtOption`` and `val` passes the
     ``is_type`` check against the option's ``xtype``.

     Raises
     ------
     DtAttributeError
         If the looked-up entry is not a ``DtOption``.

     TTypeError
         If `val` is not of the type expected by the option.
     """
     target = self._get_opt(key)

     if not isinstance(target, DtOption):
         raise DtAttributeError("Cannot modify group of options `%s`" %
                                (self._prefix + key))

     if not is_type(val, target.xtype):
         fullkey = self._prefix + key
         exptype = name_type(target.xtype)
         acttype = name_type(type(val))
         raise TTypeError("Invalid value for option `%s`: expected "
                          "type %s, got %s instead" %
                          (fullkey, exptype, acttype))

     target.value = val
Exemplo n.º 6
0
 def _fill_from_list(self, src, names, stypes):
     """
     Populate this object from a list of columns.

     Each element of `src` that is a ``range`` is expanded into a list;
     elements that are lists or numpy arrays are kept as they are. If the
     very first element is none of these, the whole of `src` is treated as
     a single column. Scanning stops at the first element that is not a
     range, list or numpy array.

     The caller's `src` list is never modified: the range expansion is done
     on a shallow copy.

     Parameters
     ----------
     src:
         Sequence of columns, or a flat sequence of values.

     names:
         Column names, passed on to ``_fill_from_dt``.

     stypes:
         Optional sequence of stypes: either a single entry that is applied
         to every column, or one entry per column.

     Raises
     ------
     TValueError
         If the number of `stypes` is neither 1 nor the number of columns.
     """
     # Shallow copy, so that replacing ranges below does not leak into the
     # list object owned by the caller.
     columns = list(src)
     for i, elem in enumerate(columns):
         if isinstance(elem, range):
             columns[i] = list(elem)
         elif isinstance(elem, list) or is_type(elem, NumpyArray_t):
             pass
         else:
             if i == 0:
                 # `src` is a flat list of values: wrap it as one column.
                 columns = [src]
             break

     coltypes = None
     if stypes:
         ncols = len(columns)
         if len(stypes) == 1:
             coltypes = [stype(stypes[0]).value] * ncols
         elif len(stypes) == ncols:
             coltypes = [stype(s).value for s in stypes]
         else:
             raise TValueError("Number of stypes (%d) is different from "
                               "the number of source columns (%d)" %
                               (len(stypes), ncols))
     _dt = core.datatable_from_list(columns, coltypes)
     self._fill_from_dt(_dt, names=names)
Exemplo n.º 7
0
def make_rowfilter(rows, ee, _nested=False) -> RFNode:
    """
    Create an :class:`RFNode` from the provided expression.

    This is a factory function that instantiates an appropriate subclass of
    :class:`RFNode`, depending on the provided argument `rows`.

    Parameters
    ----------
    rows:
        An expression that will be converted into one of the RFNodes. This can
        have a variety of different types, see `help(Frame.__call__)` for
        more information.

    ee: EvaluationEngine
        The evaluation context within which the expression should be computed.

    _nested: bool, default False
        Internal attribute, used to avoid deep recursion when `make_rowfilter()`
        calls itself. When this attribute is False recursion is allowed,
        otherwise not: a function that itself returns a function is rejected
        with a TTypeError.

    Returns
    -------
    RFNode
        One of AllRFNode, SliceRFNode, ArrayRFNode, MultiSliceRFNode,
        BooleanColumnRFNode, IntegerColumnRFNode or FilterExprRFNode.

    Raises
    ------
    TTypeError
        If `rows` is a boolean, a Frame whose column is neither boolean nor
        integer, or a value of an unsupported type.

    TValueError
        If a row index, range or slice is invalid for the target datatable,
        or if a numpy array / Frame selector has an unsuitable shape, dtype
        or length.
    """
    nrows = ee.dt.nrows
    if rows is Ellipsis or rows is None:
        return AllRFNode(ee)

    if rows is True or rows is False:
        # Note: True/False are integer objects in Python
        raise TTypeError("Boolean value cannot be used as a `rows` selector")

    if isinstance(rows, (int, slice, range)):
        rows = [rows]

    from_generator = False
    if isinstance(rows, types.GeneratorType):
        # If an iterator is given, materialize it first. Otherwise there
        # is no way to ensure that the produced indices are valid.
        rows = list(rows)
        from_generator = True

    if isinstance(rows, (list, tuple, set)):
        bases = []
        counts = []
        steps = []
        for i, elem in enumerate(rows):
            if isinstance(elem, int):
                if -nrows <= elem < nrows:
                    # `elem % nrows` forces the row number to become positive
                    bases.append(elem % nrows)
                else:
                    raise TValueError(
                        "Row `%d` is invalid for datatable with %s" %
                        (elem, plural(nrows, "row")))
            elif isinstance(elem, (range, slice)):
                if elem.step == 0:
                    raise TValueError("In %r step must not be 0" % elem)
                if not all(x is None or isinstance(x, int)
                           for x in (elem.start, elem.stop, elem.step)):
                    raise TValueError("%r is not integer-valued" % elem)
                if isinstance(elem, range):
                    res = normalize_range(elem, nrows)
                    if res is None:
                        raise TValueError(
                            "Invalid %r for a datatable with %s" %
                            (elem, plural(nrows, "row")))
                else:
                    res = normalize_slice(elem, nrows)
                start, count, step = res
                assert count >= 0
                if count == 0:
                    pass  # don't do anything
                elif count == 1:
                    bases.append(start)
                else:
                    # `counts`/`steps` are filled lazily: pad them with 1s
                    # for all single-row entries accumulated so far, so that
                    # the new entry lands at the same index as its base.
                    if len(counts) < len(bases):
                        counts += [1] * (len(bases) - len(counts))
                        steps += [1] * (len(bases) - len(steps))
                    bases.append(start)
                    counts.append(count)
                    steps.append(step)
            else:
                if from_generator:
                    raise TValueError(
                        "Invalid row selector %r generated at position %d" %
                        (elem, i))
                else:
                    raise TValueError(
                        "Invalid row selector %r at element %d of the "
                        "`rows` list" % (elem, i))
        if not counts:
            # Only individual row indices were collected.
            if len(bases) == 1:
                if bases[0] == 0 and nrows == 1:
                    return AllRFNode(ee)
                return SliceRFNode(ee, bases[0], 1, 1)
            else:
                return ArrayRFNode(ee, bases)
        elif len(bases) == 1:
            if bases[0] == 0 and counts[0] == nrows and steps[0] == 1:
                return AllRFNode(ee)
            else:
                return SliceRFNode(ee, bases[0], counts[0], steps[0])
        else:
            return MultiSliceRFNode(ee, bases, counts, steps)

    if is_type(rows, NumpyArray_t):
        arr = rows
        if not (len(arr.shape) == 1
                or len(arr.shape) == 2 and min(arr.shape) == 1):
            raise TValueError(
                "Only a single-dimensional numpy.array is allowed"
                " as a `rows` argument, got %r" % arr)
        if len(arr.shape) == 2 and arr.shape[1] > 1:
            # Turn a (1, n) row vector into an (n, 1) column vector.
            arr = arr.T
        if not (str(arr.dtype) == "bool" or str(arr.dtype).startswith("int")):
            raise TValueError("Either a boolean or an integer numpy.array is "
                              "expected for `rows` argument, got %r" % arr)
        # At this point `arr` has shape (n,) or (n, 1), so the number of
        # selector entries is `shape[0]`. (`shape[-1]` would be 1 for every
        # 2-D input and wrongly reject valid boolean selectors.)
        arr_len = arr.shape[0]
        if str(arr.dtype) == "bool" and arr_len != nrows:
            raise TValueError("Cannot apply a boolean numpy array of length "
                              "%d to a datatable with %s" %
                              (arr_len, plural(nrows, "row")))
        rows = datatable.Frame(arr)
        assert rows.ncols == 1
        assert rows.ltypes[0] == ltype.bool or rows.ltypes[0] == ltype.int

    if is_type(rows, Frame_t):
        if rows.ncols != 1:
            raise TValueError("`rows` argument should be a single-column "
                              "datatable, got %r" % rows)
        col0type = rows.ltypes[0]
        if col0type == ltype.bool:
            if rows.nrows != nrows:
                s1rows = plural(rows.nrows, "row")
                s2rows = plural(nrows, "row")
                raise TValueError("`rows` datatable has %s, but applied to a "
                                  "datatable with %s" % (s1rows, s2rows))
            return BooleanColumnRFNode(ee, rows)
        elif col0type == ltype.int:
            return IntegerColumnRFNode(ee, rows)
        else:
            raise TTypeError("`rows` datatable should be either a boolean or "
                             "an integer column, however it has type %s" %
                             col0type)

    # A function is evaluated once and its result is converted in turn; the
    # result of that call is not allowed to be yet another function, which
    # would otherwise permit unbounded recursion.
    if isinstance(rows, types.FunctionType) and not _nested:
        return make_rowfilter(rows(f), ee, _nested=True)

    if isinstance(rows, BaseExpr):
        return FilterExprRFNode(ee, rows)

    if _nested:
        raise TTypeError("Unexpected result produced by the `rows` "
                         "function: %r" % (rows, ))
    else:
        raise TTypeError("Unexpected `rows` argument: %r" % (rows, ))