Example #1
 def test_fast_iterator(self):
     a = Vector.fast(list(range(10)))
     b = Vector.fast(range(10))
     c = Vector.fast(x for x in range(10))
     d = Vector.fast(map(lambda x: x, range(10)))
     assert a.equal(b)
     assert a.equal(c)
     assert a.equal(d)
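The test above relies on Vector.fast accepting any iterable of a known element type, not just lists. A rough illustration of the same iterable-to-typed-array idea in plain NumPy (np.fromiter is not necessarily what dataiter uses internally; this only sketches the behaviour the test checks):

import numpy as np

# Any iterable with a known dtype can be materialized into one typed array,
# which is what the test exercises for lists, ranges, generators and maps.
a = np.fromiter(range(10), dtype=int)
b = np.fromiter((x for x in range(10)), dtype=int)
c = np.fromiter(map(lambda x: x, range(10)), dtype=int)
assert np.array_equal(a, b) and np.array_equal(a, c)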
Example #2
    def aggregate(self, **colname_function_pairs):
        """
        Return group-wise calculated summaries.

        Usually aggregation is preceded by grouping, which can be conveniently
        written via method chaining as ``data.group_by(...).aggregate(...)``.

        In `colname_function_pairs`, `function` receives as an argument a data
        frame object, a group-wise subset of all rows. It should return a
        scalar value. Common aggregation functions have shorthand helpers
        available under :mod:`dataiter`; see the guide on :doc:`aggregation
        </aggregation>` for details.

        >>> data = di.read_csv("data/listings.csv")
        >>> # The aggregations below are identical. Usually you'll get by
        >>> # with the shorthand helpers, but for complicated calculations,
        >>> # you might need custom lambda functions.
        >>> data.group_by("hood").aggregate(n=di.count(), price=di.mean("price"))
        >>> data.group_by("hood").aggregate(n=lambda x: x.nrow, price=lambda x: x.price.mean())
        """
        group_colnames = self._group_colnames
        data = self.sort(**dict.fromkeys(group_colnames, 1))
        data._index_ = np.arange(data.nrow)
        stat = data.unique(*group_colnames).select("_index_", *group_colnames)
        indices = np.split(data._index_, stat._index_[1:])
        group_aware = [getattr(x, "group_aware", False) for x in colname_function_pairs.values()]
        if any(group_aware):
            groups = Vector.fast(range(len(indices)), int)
            n = Vector.fast(map(len, indices), int)
            data._group_ = np.repeat(groups, n)
        slices = None
        for colname, function in colname_function_pairs.items():
            if getattr(function, "group_aware", False):
                # The function might leave Nones in its output; once those
                # are replaced with the proper default, we can do a fast
                # conversion to DataFrameColumn.
                column = function(data)
                default = function.default
                for i in range(len(column)):
                    if column[i] is None:
                        column[i] = default
                assert len(column) == stat.nrow
                column = DataFrameColumn.fast(column)
                stat[colname] = column
            else:
                # When using an arbitrary function, we cannot know
                # what special values to expect and thus we end up
                # needing to use the slow Vector.__init__.
                if slices is None:
                    slices = [data._view_rows(x) for x in indices]
                stat[colname] = [function(x) for x in slices]
        return stat.unselect("_index_", "_group_")
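The core trick in aggregate above is the sort-then-split indexing: rows are sorted by the group columns, the first row of each unique group marks a boundary, and np.split turns the running row-index array into per-group index lists. A minimal NumPy sketch of just that step, on toy data rather than dataiter objects:

import numpy as np

# Toy group key standing in for the group-by columns.
key = np.array(["b", "a", "b", "a", "a"])
order = np.argsort(key, kind="stable")  # sort rows by group key
sorted_key = key[order]
index = np.arange(len(key))
# A group boundary starts wherever the sorted key changes value.
starts = np.flatnonzero(np.r_[True, sorted_key[1:] != sorted_key[:-1]])
groups = np.split(index, starts[1:])
# groups -> [array([0, 1, 2]), array([3, 4])]: positions in sorted order;
# order[g] maps each group back to the original row numbers.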
Example #3
    def filter_out(self, rows=None, **colname_value_pairs):
        """
        Return rows that don't match the given condition.

        Filtering can be done by either `rows` or `colname_value_pairs`. `rows`
        can be either a boolean vector or a function that receives the data
        frame as argument and returns a boolean vector. The latter is
        especially useful in a method chaining context where you don't have
        direct access to the data frame in question. Alternatively,
        `colname_value_pairs` provides a shorthand to check against a fixed
        value. The examples below do equivalent filtering all three ways.

        >>> data = di.read_csv("data/listings.csv")
        >>> data.filter_out(data.hood == "Manhattan")
        >>> data.filter_out(lambda x: x.hood == "Manhattan")
        >>> data.filter_out(hood="Manhattan")
        """
        if rows is not None:
            if callable(rows):
                rows = rows(self)
        elif colname_value_pairs:
            rows = Vector.fast([True], bool).repeat(self.nrow)
            for colname, value in colname_value_pairs.items():
                rows = rows & (self[colname] == value)
        rows = self._parse_rows_from_boolean(rows)
        # Drop the matching rows from each column and build a new data frame.
        return self.__class__(**{
            colname: np.delete(column, rows)
            for colname, column in self.items()
        })
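In the colname_value_pairs branch above, the keyword arguments are reduced to a single boolean mask and the matching rows are dropped. The same reduction in plain NumPy terms, on toy columns rather than dataiter vectors:

import numpy as np

hood = np.array(["Manhattan", "Brooklyn", "Manhattan"])
price = np.array([100, 80, 120])
columns = {"hood": hood, "price": price}
# Start from an all-True mask and AND in one equality test per pair.
rows = np.ones(len(hood), dtype=bool)
for colname, value in {"hood": "Manhattan"}.items():
    rows &= columns[colname] == value
# filter_out keeps the complement: rows that do NOT match.
assert list(price[~rows]) == [80]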
Example #4
 def get_part(data, colname):
     # `data_frames` comes from the enclosing scope (presumably the full
     # list of frames being combined).
     if colname in data:
         return data[colname]
     for ref in data_frames:
         if colname not in ref:
             continue
         # `data` lacks this column: fill it with the reference column's
         # NA value, broadcast to the frame's row count.
         value = ref[colname].na_value
         dtype = ref[colname].na_dtype
         return Vector.fast([value], dtype).repeat(data.nrow)
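When data lacks the requested column, the fallback builds a constant NA placeholder of the right length. A minimal NumPy illustration of that broadcast, with np.nan standing in for whatever na_value and na_dtype the reference column declares:

import numpy as np

nrow = 4
# One NA value repeated to the frame's row count, mirroring
# Vector.fast([value], dtype).repeat(data.nrow) above.
placeholder = np.repeat(np.array([np.nan], dtype=float), nrow)
# placeholder -> array([nan, nan, nan, nan])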
Example #5
 def _parse_rows_from_integer(self, rows):
     return Vector.fast(rows, int)
Example #6
 def _parse_rows_from_boolean(self, rows):
     rows = Vector.fast(rows, bool)
     if len(rows) != self.nrow:
         raise ValueError("Bad length for boolean rows")
     return Vector.fast(np.nonzero(rows)[0], int)
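In NumPy terms, the boolean parser turns a mask as long as the frame into the integer positions of its True entries:

import numpy as np

mask = np.array([True, False, True, True])
indices = np.nonzero(mask)[0]
# indices -> array([0, 2, 3]), the row numbers where the mask is True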
Example #7
 def _parse_cols_from_integer(self, cols):
     return Vector.fast(cols, int)
Example #8
 def _parse_cols_from_boolean(self, cols):
     cols = Vector.fast(cols, bool)
     if len(cols) != self.ncol:
         raise ValueError("Bad length for boolean cols")
     return Vector.fast(np.nonzero(cols)[0], int)
Example #9
 def to_string(self, *, max_rows=None, max_width=None):
     geometry = [f"<{x['type']}>" for x in self.geometry]
     data = self.modify(geometry=Vector.fast(geometry, object))
     return DataFrame.to_string(data, max_rows=max_rows, max_width=max_width)
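The placeholder step above reduces each geometry object to a short "<Type>" tag before the frame is rendered as text; on plain dictionaries the same expression gives, for example:

geometry = [{"type": "Point"}, {"type": "Polygon"}]
tags = [f"<{x['type']}>" for x in geometry]
# tags -> ['<Point>', '<Polygon>']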
Example #10
 def test_fast(self):
     a = Vector.fast([1, 2, 3], int)
     b = Vector([1, 2, 3], int)
     assert a.is_integer()
     assert a.equal(b)