Example #1
File: utils.py  Project: yliou/spark
def compare_null_last(
    left: Column,
    right: Column,
    comp: Callable[[Column, Column], Column],
) -> Column:
    return (left.isNotNull() & right.isNotNull()
            & comp(left, right)) | (left.isNotNull() & right.isNull())
Example #2
def compare_null_last(
    left: spark.Column,
    right: spark.Column,
    comp: Callable[[spark.Column, spark.Column], spark.Column],
) -> spark.Column:
    return (left.isNotNull() & right.isNotNull()
            & comp(left, right)) | (left.isNotNull() & right.isNull())
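A minimal usage sketch (not from the original sources), assuming only that pyspark is installed; the DataFrame contents and the choice of Column.__lt__ as the comparator are illustrative:

from pyspark.sql import Column, SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (None,), (3,)], ["x"])

# Reuse compare_null_last from above: comp holds only when both sides are non-null,
# and a non-null left compared against a null right also evaluates to True.
flag = compare_null_last(F.col("x"), F.lit(2), Column.__lt__)
df.select("x", flag.alias("x_lt_2_nulls_last")).show()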
Example #3
from pyspark.sql.functions import col, explode

def flattenProductHierarchyRecursive(df):
    # "explode" creates a new row for each element in the given array or map column.
    # col() references a DataFrame column by name.
    if df.select(explode('categories')).count() <= 0:
        return df.select('parentId', 'childId', 'friendlyName')
    else:
        dfR = df.select('childId', explode('categories').alias('CatArray'))\
            .select(col('childId').alias('parentId'),
                    col('CatArray.id').alias('childId'),
                    col('CatArray.friendlyName').alias('friendlyName'),
                    col('CatArray.categories').alias('categories'))
    return df.select('parentId', 'childId', 'friendlyName')\
        .union(flattenProductHierarchyRecursive(dfR).select('parentId', 'childId', 'friendlyName'))
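A hypothetical call site for the recursive helper, assuming a root frame dfRoot prepared as in Example #30 with parentId, childId, friendlyName and categories columns:

flatDf = flattenProductHierarchyRecursive(dfRoot)
flatDf.show()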
Example #4
 def temporal_key_column(self) -> Column:
     """
     Fetch the temporal key column, if any.
     :return: Temporal key column, or None.
     """
     col = self._jrfctx.temporalKeyColumn(self._jdf)
     return col and Column(col)
Example #5
 def spatial_key_column(self) -> Column:
     """
     Fetch the tagged spatial key column.
     :return: Spatial key column
     """
     col = self._jrfctx.spatialKeyColumn(self._jdf)
     return Column(col)
Example #6
 def tile_columns(self) -> List[Column]:
     """
     Fetches columns of type Tile.
     :return: One or more Column instances associated with Tiles.
     """
     cols = self._jrfctx.tileColumns(self._jdf)
     return [Column(c) for c in cols]
Example #7
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in spark sql can't be applied in
    a window. This implementation allows that to work
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc)
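A sketch of how this wrapper might be used over a window, which plain F.countDistinct does not support; it assumes the AtLeastNDistinct expression behaves as a window-capable aggregate (as the docstring suggests), and the DataFrame, column names and threshold are illustrative only:

from pyspark.sql import Window
from pyspark.sql import functions as F

w = Window.partitionBy("query")
df = df.withColumn(
    "has_two_distinct_pages",
    at_least_n_distinct(F.col("page_id"), 2).over(w))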
Example #8
def __withField(self: Column, fieldName: str, fieldValue: Column):
    """
    An expression that adds/replaces a field by name in a `StructType`.
    If schema contains multiple fields with fieldName, they will all be replaced with fieldValue.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _column = _columnWithCustomMethods.withField(fieldName, fieldValue._jc)
    return Column(_column)
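A hedged usage sketch: since the function takes self as a Column, it can be attached to Column and called as a method. This assumes the snippet runs in the same module where __withField is defined, that the com.github.fqaiser94.mse jar is on the classpath, and that the struct column and field names below exist:

from pyspark.sql import Column
from pyspark.sql import functions as F

Column.withField = __withField
df = df.withColumn("person", F.col("person").withField("age", F.lit(30)))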
Example #9
    def generate_uuid(self):
        """ Generate V4 UUID.

        Returns:
            Spark Column (StringType): containing v4 UUIDs.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _generate_uuid = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.generateUUID_UDF()
        return Column(_generate_uuid.apply(_to_seq(sc, [], _to_java_column)))
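A minimal usage sketch, assuming dw is an instance of the wrapper class that exposes this method and the civicboost UDF jar is available on the cluster:

df = df.withColumn("row_uuid", dw.generate_uuid())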
Example #10
def with_meta(self, alias, meta):
    """
    In pyspark 2.1 there is no simple way to change the metadata of a column; that only became available in pyspark 2.2.
    This function takes a column and returns it aliased with new metadata.
    :param self: A pyspark column
    :param alias: Alias to give the returned column
    :param meta: New metadata for the column
    """
    sc = SparkContext._active_spark_context
    jmeta = sc._gateway.jvm.org.apache.spark.sql.types.Metadata
    return Column(getattr(self._jc, "as")(alias, jmeta.fromJson(json.dumps(meta))))
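A hedged usage sketch: attach the helper to Column (it takes self as a column) and tag a column with custom metadata, mirroring the alias(metadata=...) keyword that arrived in pyspark 2.2. The column name and metadata dict are assumptions for illustration:

from pyspark.sql import Column
from pyspark.sql import functions as F

Column.with_meta = with_meta
df = df.withColumn(
    "label",
    F.col("label").with_meta("label", {"comment": "training target"}))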
Example #11
 def _(*cols):
     jcontainer = self.get_java_container(
         package_name=package_name,
         object_name=object_name,
         java_class_instance=java_class_instance)
     # Ensure that your argument is a column
     function = getattr(jcontainer, name)
     judf = function()
     jc = judf.apply(
         self.to_scala_seq([_to_java_column(c) for c in cols]))
     return Column(jc)
Example #12
def __dropFields(self: Column, *fieldNames: str):
    """
    An expression that drops fields by name in a `StructType`.
    This is a no-op if schema doesn't contain given field names.
    If schema contains multiple fields matching any one of the given fieldNames, they will all be dropped.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _fieldNames = sc._jvm.PythonUtils.toSeq(fieldNames)
    _column = _columnWithCustomMethods.dropFields(_fieldNames)
    return Column(_column)
Example #13
    def clean_string(self, target_col):
        """ Remove Java ISO control characters from, and trim, the input string.

        Args:
            target_col (Spark Column): target column to be cleaned.

        Returns:
            Spark Column (StringType): cleaned version of input column.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _clean_string = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.cleanString_UDF()
        return Column(_clean_string.apply(_to_seq(sc, [target_col], _to_java_column)))
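A usage sketch representative of this and the following wrapper methods, assuming dw is an instance of the class they belong to and the civicboost UDF jar is available on the cluster; the column names are illustrative:

from pyspark.sql import functions as F

df = df.withColumn("name_clean", dw.clean_string(F.col("name")))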
Example #14
    def normalize_date_md(self, target_col):
        """ Convert string to date where MONTH is BEFORE DAY.

        Args:
            target_col (Spark Column): containing strings representing dates.

        Returns:
            Spark Column (DateType): containing dates converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _normalize_date_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateMD_UDF()
        return Column(_normalize_date_md.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #15
    def string_to_double_cfd(self, target_col):
        """ Convert strings to doubles where a comma represents the decimal place (`cfd`).

        Args:
            target_col (Spark Column): containing double values in string format.

        Returns:
            Spark Column (DoubleType): containing double values converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoubleCommaForDecimal_UDF()
        return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #16
    def map_booleans_ynu(self, target_col):
        """ Map boolean values to `Y`, `N`, `Unknown`

        Args:
            target_col (Spark Column): target column containing boolean values to map.

        Returns:
            Spark Column (StringType): mapped values (`Y`, `N`, `Unknown`)
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _map_booleans_ynu = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.mapBooleansYNU_UDF()
        return Column(_map_booleans_ynu.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #17
    def normalize_timestamp_dm(self, target_col):
        """ Convert string to timestamp where DAY is BEFORE MONTH.

        Args:
            target_col (Spark Column): containing strings representing timestamps.

        Returns:
            Spark Column (TimestampType): containing timestamps converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _normalize_timestamp_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampDM_UDF()
        return Column(_normalize_timestamp_dm.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #18
    def empty_string_to_null(self, target_col):
        """ Convert empty strings to nulls.

        Args:
            target_col (Spark Column): target column to convert.

        Returns:
            Spark Column (StringType): target column with empty values converted to nulls.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _empty_string_to_null = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.emptyStringToNull_UDF()
        return Column(_empty_string_to_null.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #19
    def string_is_number(self, target_col):
        """ Return a boolean indicating whether the string can be converted to a number.

        Args:
            target_col (Spark Column): containing strings to check for convertibility to a number.

        Returns:
            Spark Column (BooleanType): whether the string can be converted to a number.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _string_is_number = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringIsNumber_UDF()
        return Column(_string_is_number.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #20
def __withFieldRenamed(self: Column, existingFieldName: str,
                       newFieldName: str):
    """
    An expression that renames a field by name in a `StructType`.
    This is a no-op if schema doesn't contain any field with existingFieldName.
    If schema contains multiple fields with existingFieldName, they will all be renamed to newFieldName.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _column = _columnWithCustomMethods.withFieldRenamed(
        existingFieldName, newFieldName)
    return Column(_column)
Example #21
def add_struct_field(nestedStruct: str, fieldName: str, fieldValue: Column):
    """
    A convenience method for adding/replacing a field by name inside a deeply nested struct.

    :param nestedStruct : e.g. "a.b.c" where a, b, and c are StructType columns and a is a top-level StructType Column and c is the StructType Column to add/replace field in.
    :param fieldName    : The name of the StructField to add (if it does not already exist) or replace (if it already exists).
    :param fieldValue   : The value to assign to fieldName.
    :return: a copy of the top-level struct column (a) with the field added/replaced.
    """
    sc = SparkContext._active_spark_context
    _add_struct_field = sc._jvm.com.github.fqaiser94.mse.methods.add_struct_field
    _column = _add_struct_field(nestedStruct, fieldName, fieldValue._jc)
    return Column(_column)
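A hedged usage sketch: replace the whole top-level struct with a copy that has the new field set three levels down. The column and field names are assumptions for illustration, and the mse jar must be on the classpath:

from pyspark.sql import functions as F

df = df.withColumn("a", add_struct_field("a.b.c", "d", F.lit(123)))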
Example #22
 def _(*cols):
     jcontainer = self.get_java_container(
         package_name=package_name,
         object_name=object_name,
         java_class_instance=java_class_instance)
     # Ensure that your argument is a column
     col_args = [
         col._jc if isinstance(col, Column) else _make_col(col)._jc
         for col in cols
     ]
     function = getattr(jcontainer, name)
     args = col_args
     jc = function(*args)
     return Column(jc)
Example #23
def add_meta(sc, col, metadata):
    """Add metadata to a column

    Adds metadata to a column for describing extra properties. This metadata survives
    serialization from dataframe to parquet and back to dataframe. Any manipulation
    of the column, such as aliasing, will lose the metadata.

    Parameters
    ----------
    sc : pyspark.SparkContext
    col : pyspark.sql.Column
    metadata : dict

    Returns
    -------
    pyspark.sql.Column
    """
    meta = sc._jvm.org.apache.spark.sql.types \
        .Metadata.fromJson(json.dumps(metadata))
    return Column(getattr(col._jc, 'as')('', meta))
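A minimal usage sketch, assuming a SparkSession named spark and a numeric column called score; the metadata is expected to be visible on the selected column's schema field:

from pyspark.sql import functions as F

sc = spark.sparkContext
tagged = df.select(add_meta(sc, F.col("score"), {"unit": "points"}))
print(tagged.schema.fields[0].metadata)  # expected to include {'unit': 'points'}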
Example #24
 def pow_func(left, right):
     return F.when(left == 1, left).otherwise(Column.__pow__(left, right))
Example #25
 def rpow_func(left, right):
     return F.when(F.lit(right == 1), right).otherwise(Column.__rpow__(left, right))
Example #26
File: utils.py  Project: jerqi/spark
def compare_allow_null(
    left: Column,
    right: Column,
    comp: Callable[[Column, Column], Column],
) -> Column:
    return left.isNull() | right.isNull() | comp(left, right)
Example #27
File: utils.py  Project: jerqi/spark
def compare_disallow_null(
    left: Column,
    right: Column,
    comp: Callable[[Column, Column], Column],
) -> Column:
    return left.isNotNull() & right.isNotNull() & comp(left, right)
Example #28
def compare_allow_null(
    left: spark.Column,
    right: spark.Column,
    comp: Callable[[spark.Column, spark.Column], spark.Column],
) -> spark.Column:
    return left.isNull() | right.isNull() | comp(left, right)
Example #29
def compare_disallow_null(
    left: spark.Column,
    right: spark.Column,
    comp: Callable[[spark.Column, spark.Column], spark.Column],
) -> spark.Column:
    return left.isNotNull() & right.isNotNull() & comp(left, right)
Example #30
    # Convert the rdd of Categories into a Data Frame
    df = sqlContext.read.json(rdd)

    # This will print the Schema of the Categories Object
    df.printSchema()

    # Displays the contents of the Categories Object
    df.show()
    # from pyspark.sql.functions import split, explode
    from pyspark.sql.functions import *
    df.select(explode('categories')).show()

    from pyspark.sql import Row
    # Select a few columns from the JSON file; col() references columns by name
    dfRoot = df.select(
        col('id').alias('parentId'),
        col('id').alias('childId'), 'friendlyName', 'categories')

    # Display the results of the selected Columns
    dfRoot.show()

    from pyspark.sql.functions import *

    # Recursive function to assign the parent and child Ids appropriately
    def flattenProductHierarchyRecursive(df):
        # "explode" function creates a new row for each element in the given array or map column (in a DataFrame).
        if df.select(explode('categories')).count() <= 0:
            return df.select('parentId', 'childId', 'friendlyName')
        else:
            dfR = df.select('childId', explode('categories').alias('CatArray'))\
                .select(col('childId').alias('parentId'),
                        col('CatArray.id').alias('childId'),
                        col('CatArray.friendlyName').alias('friendlyName'),
                        col('CatArray.categories').alias('categories'))