Exemplo n.º 1
0
def map_annotations(f, output_type: DataType):
    """Build a Spark UDF that applies ``f`` to an Annotator's results.

    The UDF converts every incoming row with ``Annotation.fromRow``, hands the
    resulting list to ``f`` in a single call, and converts each item that
    ``f`` returns back with ``Annotation.toRow``.

    Parameters
    ----------
    f : function
        Callable that receives the list of converted annotations. Every item
        of the iterable it returns is passed to ``Annotation.toRow``.
    output_type : :class:`pyspark.sql.types.DataType`
        Return type declared for the UDF.

    Returns
    -------
    :func:`pyspark.sql.functions.udf`
        Spark UserDefinedFunction (udf)

    Examples
    --------
    >>> from sparknlp.pretrained import PretrainedPipeline
    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
    >>> result = explain_document_pipeline.transform(data)

    The output type has to be passed explicitly so that Spark knows the type
    of the produced column. Here it is an array of Annotations.

    >>> from sparknlp.functions import *
    >>> def nnp_tokens(annotations):
    ...     return list(
    ...         filter(lambda annotation: annotation.result == 'NNP', annotations)
    ...     )
    >>> result.select(
    ...     map_annotations(nnp_tokens, Annotation.arrayType())('pos').alias("nnp")
    ... ).selectExpr("explode(nnp) as nnp").show(truncate=False)
    +-----------------------------------------+
    |nnp                                      |
    +-----------------------------------------+
    |[pos, 0, 2, NNP, [word -> U.N], []]      |
    |[pos, 14, 18, NNP, [word -> Ekeus], []]  |
    |[pos, 30, 36, NNP, [word -> Baghdad], []]|
    +-----------------------------------------+
    """
    # A lambda (not a named helper) is kept on purpose: the UDF's name is
    # derived from the wrapped callable.
    return udf(
        lambda content: list(
            map(Annotation.toRow, f(list(map(Annotation.fromRow, content))))
        ),
        output_type,
    )
Exemplo n.º 2
0
def map_annotations_array(f, output_type: DataType):
    """Build a Spark UDF that applies ``f`` to nested Annotator results.

    The UDF's argument is iterated as a collection of row collections. All
    inner rows are flattened, in order, into one list (each converted with
    ``Annotation.fromRow``), that list is handed to ``f`` in a single call,
    and each item ``f`` returns is converted back with ``Annotation.toRow``.

    Parameters
    ----------
    f : function
        Callable that receives the flattened list of converted annotations.
        Every item of the iterable it returns is passed to
        ``Annotation.toRow``.
    output_type : :class:`pyspark.sql.types.DataType`
        Return type declared for the UDF.

    Returns
    -------
    :func:`pyspark.sql.functions.udf`
        Spark UserDefinedFunction (udf)
    """
    # A lambda (not a named helper) is kept on purpose: the UDF's name is
    # derived from the wrapped callable.
    return udf(
        lambda cols: list(
            map(
                Annotation.toRow,
                f([Annotation.fromRow(row) for group in cols for row in group]),
            )
        ),
        output_type,
    )
Exemplo n.º 3
0
def map_annotations_strict(f):
    """Build a Spark UDF that applies ``f`` to an Annotator's results, with
    the return type fixed to ``ArrayType(Annotation.dataType())``.

    The UDF converts every incoming row with ``Annotation.fromRow``, hands the
    resulting list to ``f`` in a single call, and converts each item that
    ``f`` returns back with ``Annotation.toRow``.

    Parameters
    ----------
    f : function
        Callable that receives the list of converted annotations. Every item
        of the iterable it returns is passed to ``Annotation.toRow``.

    Returns
    -------
    :func:`pyspark.sql.functions.udf`
        Spark UserDefinedFunction (udf)

    Examples
    --------
    >>> from sparknlp.pretrained import PretrainedPipeline
    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
    >>> result = explain_document_pipeline.transform(data)
    >>> def nnp_tokens(annotations):
    ...     return list(
    ...         filter(lambda annotation: annotation.result == 'NNP', annotations)
    ...     )
    >>> result.select(
    ...     map_annotations_strict(nnp_tokens)('pos').alias("nnp")
    ... ).selectExpr("explode(nnp) as nnp").show(truncate=False)
    +-----------------------------------------+
    |nnp                                      |
    +-----------------------------------------+
    |[pos, 0, 2, NNP, [word -> U.N], []]      |
    |[pos, 14, 18, NNP, [word -> Ekeus], []]  |
    |[pos, 30, 36, NNP, [word -> Baghdad], []]|
    +-----------------------------------------+
    """
    annotation_array = ArrayType(Annotation.dataType())
    # A lambda (not a named helper) is kept on purpose: the UDF's name is
    # derived from the wrapped callable.
    return udf(
        lambda content: list(
            map(Annotation.toRow, f(list(map(Annotation.fromRow, content))))
        ),
        annotation_array,
    )
Exemplo n.º 4
0
def map_annotations_strict(f):
    """Wrap ``f`` in a UDF whose return type is an array of
    ``Annotation.dataType()``.

    Parameters
    ----------
    f : function
        Called once per UDF invocation with the list obtained by applying
        ``Annotation.fromRow`` to every incoming row. Each item it returns is
        converted with ``Annotation.toRow``.

    Returns
    -------
    The object produced by ``udf`` for the wrapping lambda and the
    ``ArrayType(Annotation.dataType())`` return type.
    """
    result_type = ArrayType(Annotation.dataType())
    # Kept as a lambda so the name the UDF derives from the callable is
    # unchanged.
    return udf(
        lambda content: list(
            map(Annotation.toRow, f(list(map(Annotation.fromRow, content))))
        ),
        result_type,
    )
Exemplo n.º 5
0
def map_annotations_array(f, output_type: DataType):
    """Wrap ``f`` in a UDF that operates on a flattened collection of rows.

    Parameters
    ----------
    f : function
        Called once per UDF invocation with a single flat list: every row of
        every inner collection of the UDF's argument, converted with
        ``Annotation.fromRow``. Each item it returns is converted with
        ``Annotation.toRow``.
    output_type : DataType
        Return type passed to ``udf``.

    Returns
    -------
    The object produced by ``udf`` for the wrapping lambda and
    ``output_type``.
    """
    # Kept as a lambda so the name the UDF derives from the callable is
    # unchanged.
    return udf(
        lambda cols: list(
            map(
                Annotation.toRow,
                f([Annotation.fromRow(entry) for part in cols for entry in part]),
            )
        ),
        output_type,
    )
Exemplo n.º 6
0
def map_annotations(f, output_type: DataType):
    """Wrap ``f`` in a UDF that maps over a collection of annotation rows.

    Parameters
    ----------
    f : function
        Called once per UDF invocation with the list obtained by applying
        ``Annotation.fromRow`` to every incoming row. Each item it returns is
        converted with ``Annotation.toRow``.
    output_type : DataType
        Return type passed to ``udf``.

    Returns
    -------
    The object produced by ``udf`` for the wrapping lambda and
    ``output_type``.
    """
    # Kept as a lambda so the name the UDF derives from the callable is
    # unchanged.
    return udf(
        lambda content: list(
            map(Annotation.toRow, f(list(map(Annotation.fromRow, content))))
        ),
        output_type,
    )