Example #1
    def compare(self, other, *by, ignore_columns=(), max_changed=inf):
        """
        Find differences against another data frame.

        `by` are identifier columns which are used to uniquely identify rows
        and match them between `self` and `other`. `compare` will not work if
        your data lacks suitable identifiers. `ignore_columns` is an optional
        sequence of columns, differences in which to ignore.

        `compare` returns three data frames: added rows, removed rows and
        changed values. The first two are basically subsets of the rows of
        `self` and `other`, respectively. Changed values are returned as a data
        frame with one row per differing value (not per differing row). Listing
        changes will terminate once `max_changed` is reached.

        .. warning:: `compare` is experimental, do not rely on it reporting all
                     of the differences correctly. Do not try to give it two
                     huge data frames with very little in common, unless also
                     giving some sensible value for `max_changed`.

        >>> old = di.read_csv("data/vehicles.csv")
        >>> new = old.modify(hwy=lambda x: np.minimum(100, x.hwy))
        >>> added, removed, changed = new.compare(old, "id")
        >>> changed
        """
        # Matching rows between the two frames requires that `by`
        # uniquely identifies rows on both sides.
        if self.unique(*by).nrow < self.nrow:
            raise ValueError(f"self not unique by {by}")
        if other.unique(*by).nrow < other.nrow:
            raise ValueError(f"other not unique by {by}")
        added = self.anti_join(other, *by)
        removed = other.anti_join(self, *by)
        # Tag rows with their original row numbers, then pair rows
        # present in both frames via an inner join on `by`.
        x = self.modify(_i_=range(self.nrow))
        y = other.modify(_j_=range(other.nrow))
        z = x.inner_join(y.select("_j_", *by), *by)
        colnames = util.unique_keys(self.colnames + other.colnames)
        colnames = [c for c in colnames if c not in ignore_columns]
        changed = []
        for i, j in zip(z._i_, z._j_):
            if len(changed) >= max_changed:
                print(f"max_changed={max_changed} reached, terminating")
                break
            for colname in colnames:
                if len(changed) >= max_changed: break
                # XXX: How to make a distinction between
                # a missing column and a missing value?
                xvalue = x[colname][i] if colname in x else None
                yvalue = y[colname][j] if colname in y else None
                # A pair of NAs counts as equal; anything else compares by !=.
                if (xvalue != yvalue and
                    not Vector([xvalue, yvalue]).is_na().all()):
                    # XXX: We could have a name clash here.
                    byrow = {k: x[k][i] for k in by}
                    changed.append(dict(**byrow,
                                        column=colname,
                                        xvalue=xvalue,
                                        yvalue=yvalue))

        # Use None, not empty frames, to signal "no differences".
        added = added if added.nrow > 0 else None
        removed = removed if removed.nrow > 0 else None
        changed = self.from_json(changed) if changed else None
        return added, removed, changed
Example #2
    def print_na_counts(self):
        """
        Print counts of missing values by key.

        Both keys entirely missing and keys with a value of ``None`` are
        considered missing.

        >>> data = di.read_json("data/listings.json")
        >>> data.print_na_counts()
        """
        print("Missing counts:")
        total = len(self)
        for key in util.unique_keys(itertools.chain(*self)):
            # Absent keys and explicit Nones both count as missing.
            missing = sum(item.get(key, None) is None for item in self)
            if missing > 0:
                percent = 100 * missing / total
                print(f"... {key}: {missing} ({percent:.1f}%)")
Example #3
    def fill_missing_keys(self, **key_value_pairs):
        """
        Return list with missing keys added.

        If `key_value_pairs` not given, fill all missing keys with ``None``.

        >>> data = di.read_json("data/listings.json")
        >>> data = data.fill_missing_keys(price=None)
        >>> data = data.fill_missing_keys()
        """
        if not key_value_pairs:
            # No explicit defaults: fill every key seen anywhere with None.
            all_keys = util.unique_keys(itertools.chain(*self))
            key_value_pairs = dict.fromkeys(all_keys, None)
        defaults = list(key_value_pairs.items())
        for item in self:
            for key, value in defaults:
                # Only fill in keys the item doesn't already have.
                item.setdefault(key, value)
            yield item
Example #4
    def from_json(cls, string, *, columns=(), dtypes=None, **kwargs):
        """
        Return a new data frame from JSON `string`.

        `string` can also be an already parsed list of dicts. `columns` is an
        optional sequence of columns to limit to. `dtypes` is an optional dict
        mapping column names to NumPy datatypes. `kwargs` are passed to
        ``json.loads``.
        """
        data = string
        if isinstance(data, str):
            data = json.loads(data, **kwargs)
        if not isinstance(data, list):
            raise TypeError("Not a list")
        # Take the union of keys over all rows, preserving first-seen order.
        keys = util.unique_keys(itertools.chain(*data))
        if columns:
            keys = [k for k in keys if k in columns]
        # Pivot rows to columns; missing keys become None values.
        data = {k: [row.get(k, None) for row in data] for k in keys}
        for name, dtype in (dtypes or {}).items():
            data[name] = DataFrameColumn(data[name], dtype)
        return cls(**data)
Example #5
    def rbind(self, *others):
        """
        Return data frame with rows from `others` added.

        >>> data = di.read_csv("data/listings.csv")
        >>> data.rbind(data)
        """
        frames = [self, *others]
        # All column names seen in any frame, in first-seen order.
        colnames = util.unique_keys(itertools.chain(*frames))
        def na_part(frame, colname):
            # Frame lacks the column: fill with the NA value and dtype
            # of the first frame that does have it.
            for ref in frames:
                if colname in ref:
                    na_value = ref[colname].na_value
                    na_dtype = ref[colname].na_dtype
                    return Vector.fast([na_value], na_dtype).repeat(frame.nrow)
        for colname in colnames:
            parts = [frame[colname] if colname in frame
                     else na_part(frame, colname)
                     for frame in frames]
            yield colname, DataFrameColumn(np.concatenate(parts))
Example #6
    def write_csv(self, path, *, encoding="utf-8", header=True, sep=","):
        """
        Write list to CSV file `path`.

        Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.
        """
        if not self:
            raise ValueError("Cannot write empty CSV file")
        # Take a superset of all keys.
        keys = util.unique_keys(itertools.chain(*self))
        util.makedirs_for_file(path)
        with util.xopen(path, "wt", encoding=encoding) as f:
            writer = csv.DictWriter(f,
                                    keys,
                                    dialect="unix",
                                    delimiter=sep,
                                    quoting=csv.QUOTE_MINIMAL)
            if header:
                writer.writeheader()
            for item in self:
                # Fill in missing keys as None so DictWriter doesn't raise.
                item = {**dict.fromkeys(keys), **item}
                writer.writerow(item)
Example #7
 def test_unique_keys(self):
     # Duplicates dropped, first-seen order preserved.
     cases = [
         ([1, 2, 3], [1, 2, 3]),
         ([1, 2, 3, 1], [1, 2, 3]),
     ]
     for given, expected in cases:
         assert util.unique_keys(given) == expected