Example #1
File: functions.py  Project: wwwK/koalas
# Imports needed by the snippets on this page (PySpark 3.x; these private
# helpers live in pyspark.sql.column in those releases):
from typing import Union

from pyspark import SparkContext
from pyspark.sql.column import (
    Column,
    _to_java_column,
    _to_seq,
    _create_column_from_literal,
)

def percentile_approx(col, percentage, accuracy=10000):
    """
    Returns the approximate percentile value of numeric column col at the given percentage.
    The value of percentage must be between 0.0 and 1.0.

    The accuracy parameter (default: 10000)
    is a positive numeric literal which controls approximation accuracy at the cost of memory.
    Higher value of accuracy yields better accuracy, 1.0/accuracy is the relative error
    of the approximation.

    When percentage is an array, each value of the percentage array must be between 0.0 and 1.0.
    In this case, returns the approximate percentile array of column col
    at the given percentage array.

    Ported from Spark 3.1.
    """
    sc = SparkContext._active_spark_context

    if isinstance(percentage, (list, tuple)):
        # A local list
        percentage = sc._jvm.functions.array(
            _to_seq(sc, [_create_column_from_literal(x) for x in percentage]))
    elif isinstance(percentage, Column):
        # Already a Column
        percentage = _to_java_column(percentage)
    else:
        # Probably scalar
        percentage = _create_column_from_literal(percentage)

    accuracy = (_to_java_column(accuracy) if isinstance(accuracy, Column) else
                _create_column_from_literal(accuracy))

    # _call_udf is a helper defined elsewhere in the same functions.py;
    # it invokes the named SQL function with the given JVM columns.
    return _call_udf(sc, "percentile_approx", _to_java_column(col), percentage,
                     accuracy)
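A minimal usage sketch for the function above (assuming a local SparkSession; the DataFrame and column names are illustrative, not from the source):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ["price"])

# Single percentile: approximate median of "price".
df.select(percentile_approx("price", 0.5).alias("p50")).show()

# List of percentiles: returns an array column with the 25th/50th/75th values.
df.select(percentile_approx("price", [0.25, 0.5, 0.75], accuracy=10000)).show()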
Example #2
File: functions.py  Project: zero323/spark
def repeat(col: Column, n: Union[int, Column]) -> Column:
    """
    Repeats a string column n times, and returns it as a new string column.
    """
    sc = SparkContext._active_spark_context
    n = _to_java_column(n) if isinstance(n, Column) else _create_column_from_literal(n)
    return _call_udf(sc, "repeat", _to_java_column(col), n)
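A usage sketch (reusing the `spark` session from the previous sketch; the data is illustrative):

from pyspark.sql.functions import lit

df = spark.createDataFrame([("ab",)], ["s"])
df.select(repeat(df.s, 3).alias("r")).show()       # literal int -> "ababab"
df.select(repeat(df.s, lit(2)).alias("r")).show()  # n may also be a Column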
Example #3
def date_part(field: Union[str, Column], source: Column) -> Column:
    """
    Extracts a part of the date/timestamp or interval source.
    """
    sc = SparkContext._active_spark_context
    field = (_to_java_column(field) if isinstance(field, Column) else
             _create_column_from_literal(field))
    return _call_udf(sc, "date_part", field, _to_java_column(source))
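A usage sketch (same `spark` session; the timestamp value is illustrative):

from pyspark.sql.functions import lit, to_timestamp

df = spark.createDataFrame([("2015-04-08 13:08:15",)], ["t"])
df = df.select(to_timestamp(df.t).alias("ts"))
df.select(date_part("YEAR", df.ts), date_part(lit("minute"), df.ts)).show()
# -> 2015 and 8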
Example #4
def hlike(col, regexps):
    """Hyperscan regex match. Returns true if col matches any of the regexps.

    :param col: Column to test
    :param regexps: list of string patterns to match against
    :return: boolean Column with the match result
    """
    sc = SparkContext._active_spark_context
    # Build a JVM array column out of the literal patterns.
    patterns = sc._jvm.functions.array(_to_seq(sc, [
        _create_column_from_literal(x) for x in regexps
    ]))
    # The third-party ru.napalabs.spark.hscan package must be on the JVM classpath.
    return Column(sc._jvm.ru.napalabs.spark.hscan.functions.hlike(_to_java_column(col), patterns))
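A usage sketch (same `spark` session; assumes the hscan JAR is available to the driver, and the data is illustrative):

df = spark.createDataFrame([("fatal: disk full",), ("all good",)], ["msg"])
df.select(hlike(df.msg, ["^fatal", "error"]).alias("bad")).show()
# -> true for the first row, false for the second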
Example #5
 def _(col, other):
     # Turn `other` into a literal Column and pair it with `col`'s expression.
     # `name` is a free variable: the fully qualified JVM class name of the
     # wrapping expression, captured from the enclosing factory function.
     jcol = col._jc
     sc = SparkContext._active_spark_context
     loader = sc._jvm.Thread.currentThread().getContextClassLoader()
     wclass = loader.loadClass(name)
     # Build a java.lang.Object[2] holding the two constructor arguments.
     expr_class = sc._jvm.java.lang.Object
     expr_array = sc._gateway.new_array(expr_class, 2)
     expr_array[0] = jcol.expr()
     expr_array[1] = _create_column_from_literal(other)
     # Instantiate the JVM expression via its first constructor and wrap it
     # back up as a Python Column.
     w = wclass.getConstructors()[0].newInstance(expr_array)
     wcol = sc._jvm.org.apache.spark.sql.Column(w)
     return Column(wcol)
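For context, an inner function like `_` above is usually produced by a factory that closes over `name`. A minimal sketch of that pattern (the factory name and the JVM class are hypothetical, not from the source):

def _make_wrapped_op(name):  # hypothetical factory name
    def _(col, other):
        # ... body identical to Example #5, closing over `name` ...
        ...
    return _

# "org.example.MyExpr" stands in for a real Catalyst expression class.
my_op = _make_wrapped_op("org.example.MyExpr")
out_col = my_op(df.value, 42)  # Column wrapping new MyExpr(value.expr, lit(42))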