Example #1
from pyspark.sql import functions as F

# NOTE: MAX_MONTH is a module-level constant defined elsewhere in the original source.
def get_last_month(col):
    # Hash the column to a 64-bit value and work with its absolute value.
    h = F.abs(F.xxhash64(col))
    # Pull out five successive bytes of the hash and reduce each modulo a
    # different divisor of MAX_MONTH.
    h1 = (h.bitwiseAND(0xff)) % (MAX_MONTH // 2)
    h2 = (F.shiftRight(h, 8).bitwiseAND(0xff)) % (MAX_MONTH // 3)
    h3 = (F.shiftRight(h, 16).bitwiseAND(0xff)) % (MAX_MONTH // 5)
    h4 = (F.shiftRight(h, 24).bitwiseAND(0xff)) % (MAX_MONTH // 7)
    h5 = (F.shiftRight(h, 32).bitwiseAND(0xff)) % (MAX_MONTH // 11)
    # Sum the pieces and negate, giving a deterministic negative month offset.
    return -(h1 + h2 + h3 + h4 + h5)
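A brief usage sketch (hedged): the value of MAX_MONTH and the input data below are illustrative assumptions, not part of the original source.

# Hypothetical usage of get_last_month, assuming MAX_MONTH = 120 and a local SparkSession.
from pyspark.sql import SparkSession, functions as F

MAX_MONTH = 120  # assumed value; the real constant lives elsewhere in the original module

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("alice",), ("bob",)], ["user_id"])

# Every user_id maps to the same reproducible negative month offset on every run.
df.withColumn("months_back", get_last_month(F.col("user_id"))).show()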
Example #2
# Names used below come from pyspark.sql.functions: assert_true, col, shiftRight, shiftright.
def test_shiftright(self):
    # The deprecated camelCase shiftRight and the lowercase shiftright must agree.
    self.spark.range(10).select(
        assert_true(shiftRight(col("id"), 2) == shiftright(col("id"), 2))
    ).collect()
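A minimal standalone sketch of the same behavior, assuming a local SparkSession (the lowercase shiftright spelling only exists in newer PySpark releases, so the sketch sticks to shiftRight):

# Minimal sketch: shiftRight(col, 2) is an arithmetic right shift, i.e. id >> 2.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, shiftRight

spark = SparkSession.builder.master("local[1]").getOrCreate()

# ids 0..9 shifted right by 2 bits give 0,0,0,0,1,1,1,1,2,2
spark.range(10).select(col("id"), shiftRight(col("id"), 2).alias("id_shr2")).show()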
Example #3
import healpy as hp
import numpy as np
import pandas as pd


def healpix_hist(input_df, NSIDE=64, groupby=[],
                 agg={"*": "count"}, returnDf=False):
    from pyspark.sql.functions import floor as FLOOR, col as COL, lit, shiftRight

    order0 = 12
    order  = hp.nside2order(NSIDE)
    shr    = 2*(order0 - order)

    # construct query
    df = input_df.withColumn('hpix__', shiftRight('hpix12', shr))
    gbcols = ('hpix__', )
    for axspec in groupby:
        if not isinstance(axspec, str):
            (col, c0, c1, dc) = axspec
            df = ( df
                .where((lit(c0) < COL(col)) & (COL(col) < lit(c1)))
                .withColumn(col + '_bin__', FLOOR((COL(col) - lit(c0)) / lit(dc)) * lit(dc) + lit(c0) )
                 )
            gbcols += ( col + '_bin__', )
        else:
            gbcols += ( axspec, )
    df = df.groupBy(*gbcols)

    # execute aggregation
    df = df.agg(agg)

    # fetch result
    df = df.toPandas()
    if returnDf:
        return df

    # repack the result into maps
    # Constructing the result column names is slightly fragile, because some
    # aggregate functions are pure aliases: e.g., mean(x) comes back as a column
    # named avg(x).
    results = [ f"{v}({k})" if k != "*" else f"{v}(1)" for k, v in agg.items() ]    # Result columns
    def _create_map(df):
        maps = dict()
        for val in results:
            map_ = np.zeros(hp.nside2npix(NSIDE))
            # NOTE: this assignment likely raises if the aggregated result has no rows.
            map_[df.hpix__.values] = df[val].values
            maps[val] = [ map_ ]
        return pd.DataFrame(data=maps)

    idxcols = list(gbcols[1:])
    if len(idxcols) == 0:
        ret = _create_map(df)
        assert len(ret) == 1
        if not returnDf:
            # convert to tuple, or scalar
            ret = tuple(ret[name].values[0] for name in results)
            if len(ret) == 1:
                ret = ret[0]
    else:
        ret = df.groupby(idxcols).apply(_create_map)
        ret.index = ret.index.droplevel(-1)
        ret.index.rename([ name.split("_bin__")[0] for name in ret.index.names ], inplace=True)
        if "count(1)" in ret:
                    ret = ret.rename(columns={'count(1)': 'count'})
        if not returnDf:
            if len(ret.columns) == 1:
                ret = ret.iloc[:, 0]
    return ret
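The shiftRight at the heart of this example relies on a property of HEALPix NESTED indexing: coarsening a pixel index from order0 down to order is a right shift by 2*(order0 - order) bits, because each parent pixel contains exactly four children per level. A small pure-Python sketch of that equivalence (illustrative values; healpy assumed to be installed):

# Sketch of the order-degradation trick used by shiftRight('hpix12', shr) above.
import healpy as hp

order0 = 12                      # order of the stored 'hpix12' NESTED index
NSIDE = 64
order = hp.nside2order(NSIDE)    # 6
shr = 2 * (order0 - order)       # 12 bits

pix12 = 123_456_789              # some NESTED pixel index at order 12 (illustrative)
pix_coarse = pix12 >> shr        # its parent pixel at order 6

# The same parent pixel can be recovered geometrically with healpy:
theta, phi = hp.pix2ang(hp.order2nside(order0), pix12, nest=True)
assert hp.ang2pix(NSIDE, theta, phi, nest=True) == pix_coarse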
Example #4
import math

import histbook.expr


def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
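One reason the translator only handles left_shift/right_shift when the shift amount is a Const is, presumably, that pyspark.sql.functions.shiftLeft/shiftRight take the number of bits as a plain Python int rather than a Column. A minimal standalone illustration (no histbook involved; assumes a local SparkSession):

# Minimal sketch: the second argument to shiftLeft/shiftRight is a literal int.
from pyspark.sql import SparkSession
from pyspark.sql import functions as fcns

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(16)

df.select(
    fcns.shiftLeft(df["id"], 1).alias("id_shl1"),    # id * 2
    fcns.shiftRight(df["id"], 2).alias("id_shr2"),   # id // 4 for non-negative ids
).show()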