Exemplo n.º 1
0
    def var(
        self,
        axis=None,
        skipna=True,
        ddof=1,
        split_every=False,
        dtype=None,
        out=None,
    ):
        """Return the variance of this collection along ``axis``.

        For ``axis == 1`` the variance is computed independently inside each
        partition by mapping ``M.var`` over the partitions.  For any other
        validated axis, the numeric data is reduced to a sum, a sum of
        squares and a count, and those three results are combined by
        ``var_aggregate`` (defined elsewhere in the module).

        Parameters
        ----------
        axis : optional
            Axis specifier; normalised by ``self._validate_axis``.
        skipna : bool, default True
            Forwarded to the underlying ``var`` / ``sum`` calls.
        ddof : int, default 1
            Forwarded to ``M.var`` or ``var_aggregate``.
        split_every : default False
            Forwarded to the ``sum`` and ``count`` reductions.
        dtype : optional
            Accepted but not used by this implementation.
        out : optional
            Forwarded to ``handle_out`` together with the result.

        Returns
        -------
        Whatever ``handle_out(out, result)`` returns.
        """
        axis = self._validate_axis(axis)
        meta = self._meta_nonempty.var(axis=axis, skipna=skipna)
        token = self._token_prefix + "var"

        # Row-wise variance needs no cross-partition communication.
        if axis == 1:
            row_wise = map_partitions(
                M.var,
                self,
                meta=meta,
                token=token,
                axis=axis,
                skipna=skipna,
                ddof=ddof,
            )
            return handle_out(out, row_wise)

        # Column-wise variance: build the three global aggregates first.
        numeric = self._get_numeric_data()
        reduce_opts = {"skipna": skipna, "split_every": split_every}
        # The ``1.0 *`` factor presumably promotes the sums to floating point
        # before they reach ``var_aggregate`` -- TODO confirm.
        total = 1.0 * numeric.sum(**reduce_opts)
        total_sq = 1.0 * (numeric ** 2).sum(**reduce_opts)
        counts = numeric.count(split_every=split_every)

        col_wise = map_partitions(
            var_aggregate,
            total_sq,
            total,
            counts,
            token=token,
            meta=meta,
            ddof=ddof,
        )
        if isinstance(self, DataFrame):
            # The result is labelled by column, so its divisions are taken
            # from the smallest and largest column labels.
            col_wise.divisions = (min(self.columns), max(self.columns))
        return handle_out(out, col_wise)
Exemplo n.º 2
0
Arquivo: core.py Projeto: rongou/cudf
 def var(
     self,
     axis=None,
     skipna=True,
     ddof=1,
     split_every=False,
     dtype=None,
     out=None,
     naive=False,
 ):
     """Return the variance of this collection along ``axis``.

     Three code paths exist:

     * ``axis == 1``: ``M.var`` is mapped over the partitions.
     * ``naive`` is true, or ``ddof`` is not 1: ``_naive_var`` is used,
       which receives ``ddof`` explicitly.
     * otherwise: ``_parallel_var`` is used.

     Parameters
     ----------
     axis : optional
         Axis specifier; normalised by ``self._validate_axis``.
     skipna : bool, default True
         Forwarded to the selected implementation.
     ddof : int, default 1
         Delta degrees of freedom.  Honoured on every path (see the note
         in the body about non-default values).
     split_every : default False
         Forwarded to ``_naive_var`` / ``_parallel_var``.
     dtype : optional
         Accepted but not used by this implementation.
     out : optional
         Forwarded to ``handle_out`` (directly or via the helpers).
     naive : bool, default False
         Force the ``_naive_var`` implementation.

     Returns
     -------
     The value produced by ``handle_out`` for the chosen path.
     """
     axis = self._validate_axis(axis)
     meta = self._meta_nonempty.var(axis=axis, skipna=skipna)

     # Row-wise variance is computed independently per partition.
     if axis == 1:
         result = map_partitions(
             M.var,
             self,
             meta=meta,
             token=self._token_prefix + "var",
             axis=axis,
             skipna=skipna,
             ddof=ddof,
         )
         return handle_out(out, result)

     # The ``_parallel_var`` call below is made without ``ddof``, so a
     # caller-supplied value other than the default would be silently
     # dropped on that path.  Route those requests through ``_naive_var``,
     # which does take ``ddof``, instead of returning a wrong result.
     if naive or ddof != 1:
         return _naive_var(self, meta, skipna, ddof, split_every, out)

     return _parallel_var(self, meta, skipna, split_every, out)
Exemplo n.º 3
0
def _naive_var(ddf, meta, skipna, ddof, split_every, out):
    """Compute variance from a global sum, sum of squares and count.

    The numeric data of ``ddf`` is reduced three times (``sum``, ``sum`` of
    the squared values, and ``count``); the three results are then combined
    by ``var_aggregate`` (defined elsewhere in the module), which also
    receives ``ddof``.

    Parameters
    ----------
    ddf :
        Collection providing ``_get_numeric_data`` and ``_token_prefix``.
    meta :
        Metadata passed to ``map_partitions`` for the result.
    skipna :
        Forwarded to both ``sum`` reductions.
    ddof :
        Forwarded to ``var_aggregate``.
    split_every :
        Forwarded to the ``sum`` and ``count`` reductions.
    out :
        Forwarded to ``handle_out`` together with the result.

    Returns
    -------
    Whatever ``handle_out(out, result)`` returns.
    """
    numeric = ddf._get_numeric_data()
    reduce_opts = {"skipna": skipna, "split_every": split_every}

    # The ``1.0 *`` factor presumably promotes the sums to floating point
    # before aggregation -- TODO confirm.
    total = 1.0 * numeric.sum(**reduce_opts)
    total_sq = 1.0 * (numeric ** 2).sum(**reduce_opts)
    counts = numeric.count(split_every=split_every)

    variance = map_partitions(
        var_aggregate,
        total_sq,
        total,
        counts,
        token=ddf._token_prefix + "var",
        meta=meta,
        ddof=ddof,
    )
    if isinstance(ddf, DataFrame):
        # Divisions of the per-column result span the column labels.
        variance.divisions = (min(ddf.columns), max(ddf.columns))
    return handle_out(out, variance)
Exemplo n.º 4
0
Arquivo: core.py Projeto: rongou/cudf
def _parallel_var(ddf, meta, skipna, split_every, out, ddof=1):
    """Compute variance with a tree reduction of per-partition moments.

    Every partition of the numeric data is summarised as a triple
    ``(n, avg, m2)`` -- count, mean and sum of squared deviations from the
    mean.  Triples are merged ``split_every`` at a time with the pairwise
    update used by the parallel variance algorithm until one triple
    remains, which is finalised as ``m2 / (n - ddof)``.

    Parameters
    ----------
    ddf :
        Input collection; must provide ``npartitions`` and
        ``_get_numeric_data``.
    meta :
        Metadata handed to ``new_dd_object`` for the result.
    skipna : bool
        Whether the per-partition statistics skip nulls.
    split_every : int or falsy
        Maximum number of nodes merged by one aggregation task.  A falsy
        value merges all partitions in a single step.
    out :
        Forwarded to ``handle_out`` together with the result.
    ddof : int, default 1
        Delta degrees of freedom used in the final division.  The default
        reproduces the previous hard-coded ``n - 1`` normalisation.

    Returns
    -------
    Whatever ``handle_out(out, result)`` returns.

    Raises
    ------
    ValueError
        If ``split_every`` is truthy but smaller than 2.
    """

    def _local_var(x, skipna):
        # Summarise one partition as (count, mean, sum of squared deviations).
        if skipna:
            n = x.count(skipna=skipna)
            avg = x.mean(skipna=skipna)
        else:
            # Not skipping nulls, so might as well
            # avoid the full `count` operation
            n = len(x)
            avg = x.sum(skipna=skipna) / n
        m2 = ((x - avg) ** 2).sum(skipna=skipna)
        return n, avg, m2

    def _aggregate_var(parts):
        # Fold the summaries left to right with the pairwise merge formula.
        n, avg, m2 = parts[0]
        for n_b, avg_b, m2_b in parts[1:]:
            n_a, avg_a, m2_a = n, avg, m2
            n = n_a + n_b
            avg = (n_a * avg_a + n_b * avg_b) / n
            delta = avg_b - avg_a
            m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
        return n, avg, m2

    def _finalize_var(vals, ddof):
        # Turn the fully merged summary into the variance estimate.
        n, _, m2 = vals
        return m2 / (n - ddof)

    # Build graph
    nparts = ddf.npartitions
    if not split_every:
        split_every = nparts
    elif split_every < 2:
        # A fan-in of 1 never shrinks the tree (the width loop below would
        # not terminate) and a negative fan-in yields an empty tree.
        raise ValueError("split_every must be an integer >= 2")

    # The input collection and ddof must be part of the token: without them
    # two different inputs reduced with the same options would produce
    # identical task keys and clash when their graphs are merged.
    name = "var-" + tokenize(ddf, skipna, split_every, out, ddof)
    local_name = "local-" + name
    num = ddf._get_numeric_data()
    dsk = {
        (local_name, part, 0): (_local_var, (num._name, part), skipna)
        for part in range(nparts)
    }

    # Use reduction tree: widths[d] is the number of nodes at depth d.
    widths = [nparts]
    while nparts > 1:
        nparts = math.ceil(nparts / split_every)
        widths.append(nparts)
    height = len(widths)
    for depth in range(1, height):
        p_max = widths[depth - 1]
        for group in range(widths[depth]):
            lstart = split_every * group
            lstop = min(lstart + split_every, p_max)
            node_list = [
                (local_name, p, depth - 1) for p in range(lstart, lstop)
            ]
            dsk[(local_name, group, depth)] = (_aggregate_var, node_list)

    # The last level always holds exactly one node, at index 0.  With a
    # single partition (height == 1) that node is the local summary itself.
    root = (local_name, 0, height - 1)
    dsk[(name, 0)] = (_finalize_var, root, ddof)

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf])
    result = dd.core.new_dd_object(graph, name, meta, (None, None))
    if isinstance(ddf, DataFrame):
        # Divisions of the per-column result span the column labels.
        result.divisions = (min(ddf.columns), max(ddf.columns))
    return handle_out(out, result)