Example #1
    def test_vectorized_udf_struct_type(self):
        df = self.spark.range(10)
        return_type = StructType([
            StructField('id', LongType()),
            StructField('str', StringType())])

        def func(id):
            return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

        f = pandas_udf(func, returnType=return_type)

        expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                             .alias('struct')).collect()

        actual = df.select(f(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        g = pandas_udf(func, 'id: long, str: string')
        actual = df.select(g(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        struct_f = pandas_udf(lambda x: x, return_type)
        actual = df.select(struct_f(struct(col('id'), col('id').cast('string').alias('str'))))
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            with QuietTest(self.sc):
                from py4j.protocol import Py4JJavaError
                with self.assertRaisesRegexp(
                        Py4JJavaError,
                        'Unsupported type in conversion from Arrow'):
                    self.assertEqual(expected, actual.collect())
        else:
            self.assertEqual(expected, actual.collect())
Example #2
def test_automapper_filter_and_transform_fluent(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.filter(
                column=A.column("identifier"), func=lambda x: x["use"] == lit("usual")
            ).transform(A.complex(bar=A.field("value"), bar2=A.field("system")))
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")),
        ).alias("age")
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)
Example #3
    def getDirectAccess(scOrder3, principalDS):
        scOrder3RouterInterfaceDS = scOrder3.filter(
            scOrder3.sc_id.isNotNull() & scOrder3.ne_carr.isNotNull())

        auxResource = scOrder3RouterInterfaceDS \
            .filter(scOrder3RouterInterfaceDS.resource.isNotNull()) \
            .drop("port_resource")

        auxPortResource = scOrder3RouterInterfaceDS \
            .filter(scOrder3RouterInterfaceDS.port_resource.isNotNull()) \
            .drop("resource") \
            .withColumnRenamed("port_resource", "resource")

        auxTotal = auxResource \
            .unionByName(auxPortResource) \
            .groupBy("sc_id") \
            .agg(F.collect_list(F.struct("ne_carr", "resource")).alias("router_interface"))

        joinedFastOrder3 = principalDS.join(
            auxTotal, principalDS.service_circuit == auxTotal.sc_id, "inner")

        direct_Order3 = joinedFastOrder3.filter(
            joinedFastOrder3.l3_acc_cfs_type == "Direct Access CFS Instance")
        directOrder3 = FastDsl.routerInterfaceVendorType(direct_Order3)
        directOrder3.cache()

        return directOrder3
Example #4
def melt(self,
         id_vars,
         value_vars,
         var_name="variable",
         value_name="value",
         data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self:
    :param id_vars:
    :param value_vars:
    :param var_name: column name for vars
    :param value_name: column name for values
    :param data_type: because all data must have the same type
    :return:
    """

    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name),
                 F.col(c).alias(value_name)) for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)
Example #5
def expand_array_col_into_seperate_col(colName):
    result = array([
        struct(
            col(colName).getItem(0).alias("first_year"),
            col(colName).getItem(1).alias("sec_year"))
    ])
    return result
Example #6
    def melt(self,
             value_vars: Iterable[str],
             id_vars: Iterable[str] = None,
             var_name: str = "variable",
             value_name: str = "value") -> TDataFrame:
        """

        :param self:
        :param value_vars:
        :param id_vars:
        :param var_name:
        :param value_name:
        :return:

        Convert :class:`DataFrame` from wide to long format.

        """
        id_vars = id_vars if id_vars is not None else []
        # Create array<struct<variable: str, value: ...>>
        variable_name_with_column_values = F.array(
            *(F.struct(F.lit(c).alias(var_name),
                       F.col(c).alias(value_name)) for c in value_vars))

        # Add to the DataFrame and explode
        exploded_vars_and_vals = self.withColumn(
            "variable_name_with_column_values",
            F.explode(variable_name_with_column_values))

        cols = id_vars + [
            F.col("variable_name_with_column_values")[x].alias(x)
            for x in [var_name, value_name]
        ]
        return exploded_vars_and_vals.select(*cols)
Example #7
def _mark_as_lit(data, data_type):
    # To support nested types, 'data_type' is required.
    assert data_type is not None

    if data is None:
        return f.lit(data).cast(data_type)

    if isinstance(data_type, ArrayType):
        assert isinstance(data, list)
        # Sadly you cannot create a literal from just an array in pyspark
        return f.array([_mark_as_lit(x, data_type.elementType) for x in data])
    elif isinstance(data_type, StructType):
        assert isinstance(data, tuple) and len(data) == len(data_type.fields)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        children = zip(data, data_type.fields)
        return f.struct([_mark_as_lit(x, fd.dataType).alias(fd.name) for x, fd in children])
    elif isinstance(data_type, DateType):
        # Due to https://bugs.python.org/issue13305 we need to zero pad for years prior to 1000,
        # but this works for all of them
        dateString = data.strftime("%Y-%m-%d").zfill(10)
        return f.lit(dateString).cast(data_type)
    elif isinstance(data_type, MapType):
        assert isinstance(data, dict)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        col_array = []
        for k in data:
            col_array.append(_mark_as_lit(k, data_type.keyType))
            col_array.append(_mark_as_lit(data[k], data_type.valueType))
        return f.create_map(*col_array)
    else:
        # lit does not take a data type so we might have to cast it
        return f.lit(data).cast(data_type)
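
A short usage sketch (not from the original source) for the _mark_as_lit helper above; it assumes an active SparkSession named `spark`, and the imports mirror the ones the snippet relies on:

from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Build a typed struct literal from a plain Python tuple.
payload_type = StructType([
    StructField("name", StringType()),
    StructField("count", IntegerType()),
])
payload = _mark_as_lit(("widget", 3), payload_type)

# Attach the literal to a single-row DataFrame; prints {widget, 3}.
spark.range(1).select(payload.alias("payload")).show(truncate=False)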
Example #8
def process_demographic_data(spark, input_data, output_data):
    """
    Process the demographic data by dropping duplicate rows and creating a new column 'Major Race' based on a group-by aggregation.

    Parameters:
    spark: the Spark session
    input_data: the path of the input data folder on the local machine
    output_data: the output folder in the S3 bucket
    """
    
    demo_data = input_data+'us-cities-demographics.csv'
    demo_df = spark.read.format('csv').options(header='true',sep=';').load(demo_data)

    demo_df = demo_df.select('City', 'State','Median Age','Male Population','Female Population','Total Population','Foreign-born','State Code','Race','Count').drop_duplicates(subset=['City', 'State','Race'])
    
    demo_df= demo_df.withColumn("Count",col("Count").cast(IntegerType()))
    
    # Using group by to know the major race of every city
    group_df= demo_df.groupby('City', 'State','Median Age','Male Population','Female Population','Total Population','Foreign-born','State Code').pivot('race').agg(max_('Count'))

    group_df = group_df.na.fill({'Hispanic or Latino':0, 'White':0, 'Asian':0, 'Black or African-American':0, 'American Indian and Alaska Native':0})
    cols = group_df.columns[8:13]
    maxcol = F.udf(lambda row: cols[row.index(max(row))], StringType())  
    group_df = group_df.withColumn("Major Race", maxcol(F.struct([group_df[x] for x in group_df.columns[8:13]])))
 
    group_df.write.option("header","true").csv(output_data+'demographic_data/')
Example #9
def view(df, state_col='_state', updated_col='_updated', hash_col='_hash'):
    """
    Calculate a view from a log of events by performing the following actions:
        - squashing the events for each record down to the most recent one
        - removing deleted records from the list
    """

    c = set(df.columns).difference({state_col, updated_col, hash_col})
    colnames = [x for x in df.columns if x in c]

    if updated_col not in df.columns:
        return df

    if state_col not in df.columns:
        return df

    selected_columns = colnames + ['_last.*']
    groupby_columns = colnames

    # groupby hash_col first if available
    if hash_col in df.columns:
        selected_columns = selected_columns + [hash_col]
        groupby_columns = [hash_col] + groupby_columns

    row_groups = df.groupBy(groupby_columns)
    get_sorted_array = F.sort_array(F.collect_list(
        F.struct(F.col(updated_col), F.col(state_col))),
                                    asc=False)
    df_view = row_groups.agg(
        get_sorted_array.getItem(0).alias('_last')).select(*selected_columns)
    df_view = df_view.filter("{} = 0".format(state_col))

    return df_view
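
A minimal usage sketch (toy data, not part of the original snippet) illustrating what view() computes; it assumes an active SparkSession named `spark` and the F alias for pyspark.sql.functions used above:

# Two keys with two events each: "a" stays alive, "b" is deleted in its last event.
events = spark.createDataFrame(
    [("a", 1, 0), ("a", 2, 0),
     ("b", 1, 0), ("b", 2, 1)],
    ["key", "_updated", "_state"],
)

# Keeps only the latest event per key and drops keys whose last state is non-zero,
# so the result is a single row: key=a, _updated=2, _state=0.
view(events).show()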
Example #10
def pred(var):

    global traffic_df_explicit, spark, schema_for_m

    traffic_for_m = traffic_df_explicit.select(
                     traffic_df_explicit['TID'],
                     traffic_df_explicit['DST'],
                     traffic_df_explicit['TS'].cast(IntegerType()).alias('ds'),
                     traffic_df_explicit[var].alias('y'))\
                   .filter("TID like '%DSO05LM%' and DST like '%01:00:5e:50:01:42%'")\
                   .groupBy('TID', 'DST')\
                   .agg(collect_list(struct('ds', 'y')).alias('data'))\
                   .rdd.map(lambda r: transform_data_m(r))\
                       .map(lambda d: partition_data_m(d))\
                       .filter(lambda d: len(d['train_data']) > 2)\
                       .map(lambda d: create_model_m(d))\
                       .map(lambda d: train_model_m(d))\
                       .map(lambda d: make_forecast_m(d))\
                       .map(lambda d: reduce_data_scope_m(d))\
                       .flatMap(lambda d: expand_predictions_m(d))

    traffic_for_m.cache()

    df_for_m = spark.createDataFrame(traffic_for_m, schema_for_m)

    #thread

    TH = Thread(target=forecast_from_spark, args=(df_for_m, var))
    TH.start()
Example #11
def melt(df: DataFrame,
         id_vars: Iterable[str],
         value_vars: Iterable[str],
         var_name: str = "variable",
         value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""

    # Create array<struct<variable: str, value: ...>>
    # Here each row will have a different row structure with the column name that
    # will be taken
    _vars_and_vals = array(
        *(struct(lit(c).alias(var_name),
                 col(c).alias(value_name)) for c in value_vars))

    # Add to the DataFrame and explode
    # when exploding only columns that are included in the row structure will
    # be included in the datafrmae
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # this one will have all the column names necessary
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    # when returning select from the previous one
    return _tmp.select(*cols)
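
A short usage sketch (sample data invented for illustration) for the melt function above; it assumes an active SparkSession named `spark` and the pyspark.sql.functions imports the snippet relies on:

wide_df = spark.createDataFrame(
    [(1, 80, 95), (2, 60, 70)],
    ["id", "math", "english"],
)

# One output row per (id, subject) pair: (1, math, 80), (1, english, 95), ...
melt(wide_df, id_vars=["id"], value_vars=["math", "english"],
     var_name="subject", value_name="score").show()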
Example #12
def test_spark_udf_with_single_arg(spark):
    from pyspark.sql.functions import struct

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [",".join(model_input.columns.tolist())] * len(model_input)

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel())

        udf = mlflow.pyfunc.spark_udf(spark,
                                      "runs:/{}/model".format(run.info.run_id),
                                      result_type=StringType())

        data1 = spark.createDataFrame(pd.DataFrame({
            "a": [1],
            "b": [4]
        })).repartition(1)

        result = data1.withColumn("res", udf("a")).select("res").toPandas()
        assert result.res[0] == "0"

        data2 = data1.select(struct("a", "b").alias("ab"))
        result = data2.withColumn("res", udf("ab")).select("res").toPandas()
        assert result.res[0] == "a,b"
Example #13
    def test_nested_higher_order_function(self):
        # SPARK-35382: lambda vars must be resolved properly in nested higher order functions
        from pyspark.sql.functions import flatten, struct, transform

        df = self.spark.sql(
            "SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters"
        )

        actual = df.select(
            flatten(
                transform(
                    "numbers",
                    lambda number: transform(
                        "letters", lambda letter: struct(
                            number.alias("n"), letter.alias("l"))),
                ))).first()[0]

        expected = [
            (1, "a"),
            (1, "b"),
            (1, "c"),
            (2, "a"),
            (2, "b"),
            (2, "c"),
            (3, "a"),
            (3, "b"),
            (3, "c"),
        ]

        self.assertEquals(actual, expected)
Example #14
    def main(self, sc: SparkContext, *args):
        """
        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
        """
        spark = SparkSession(sc)

        # Parsing app options
        observations_parquet_path = args[0]
        output_path = args[1]

        observations_df = spark.read.parquet(observations_parquet_path)

        adult_lacz_expression_data = get_lacz_expression_data(
            observations_df, "adult")
        embryo_lacz_expression_data = get_lacz_expression_data(
            observations_df, "embryo")

        lacz_expression_data = adult_lacz_expression_data.union(
            embryo_lacz_expression_data)
        lacz_expression_data = lacz_expression_data.withColumn(
            "id", col("gene_accession_id"))
        for col_name in lacz_expression_data.columns:
            lacz_expression_data = lacz_expression_data.withColumnRenamed(
                col_name, to_camel_case(col_name))
        lacz_expression_data = lacz_expression_data.groupBy("id").agg(
            collect_set(
                struct(*[
                    col_name for col_name in lacz_expression_data.columns
                    if col_name != "id"
                ])).alias("expressionData"))
        lacz_expression_data.write.partitionBy("id").json(output_path)
Example #15
 def test_smvExpandStruct(self):
     schema = "id:String;a:Double;b:Double"
     df1 = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
     df2 = df1.select(col("id"), struct("a", "b").alias("c"))
     res = df2.smvExpandStruct("c")
     expect = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
     self.should_be_same(expect, res)
Example #16
def melt(self,
         id_vars,
         value_vars,
         var_name="variable",
         value_name="value",
         data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark DataFrame
    :param id_vars: Column(s) with unique values to keep as identifiers
    :param value_vars: Column names that are going to be converted to column values
    :param var_name: Column name for vars
    :param value_name: Column name for values
    :param data_type: All columns must have the same type. It will transform all columns to this data type.
    :return:
    """

    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name),
                 F.col(c).alias(value_name)) for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)
Example #17
def main():
    #process input
    arguments = parse_args()

    #initialize spark
    spark = init(arguments)

    #process input
    df = readInput(arguments, spark)
    if arguments.debug:
        print lineno()
        df.show()

    #Get num conversions per user
    top10ConvertingUsers = extractTopConvertingUsers(df, 10)
    if arguments.debug:
        print top10ConvertingUsers
    writeDataframe('q1_top10ConvertingUsers', top10ConvertingUsers,
                   arguments.printHeader, arguments.partitions)

    #sessionize
    #TODO translate type to numbers so that start session will precede other actions
    windowval = Window.partitionBy('user_id').orderBy(
        'timestamp').rangeBetween(Window.unboundedPreceding, 0)
    dfSessionized = df.withColumn('session_id', fn.sum(fn.when(df["type"] == 'start_session', 1).otherwise(0)).over(windowval))\
    .groupBy('user_id','session_id')\
    .agg(fn.collect_list(fn.struct('type', 'url','timestamp')).alias('path'))
    if arguments.debug:
        print lineno()
        dfSessionized.show(100)

    convertionDistancePerUser = extractMinConversion(dfSessionized)
    if arguments.debug:
        print lineno()
        convertionDistancePerUser.show(100)
    writeDataframe('q2_conversionDistancePerUser', convertionDistancePerUser,
                   arguments.printHeader, arguments.partitions)
    avgConvserionDistance = convertionDistancePerUser.agg(
        fn.avg('conversion_distance').alias('avg_converting_distance'))
    writeDataframe('q3_avgConversionDistance', avgConvserionDistance,
                   arguments.printHeader, arguments.partitions)

    if arguments.poiFiles is not None:
        global pathOfInterest
        for poiPath in arguments.poiFiles.split(','):
            if arguments.debug:
                print 'Processing path ', poiPath
            for filePath in glob.glob(poiPath):
                if arguments.debug:
                    print 'Processing file ', filePath
                pathOfInterest = readFileToList(filePath)
                #Get users matching path of urls
                patternMatchingUsers = extractUsersMatchingPath(dfSessionized)
                if arguments.debug:
                    print lineno()
                    patternMatchingUsers.show()
                writeDataframe('q4_patternMatchingUsers/' + filePath,
                               patternMatchingUsers, arguments.printHeader,
                               arguments.partitions)
Example #18
def process_toxcast(toxcast: str) -> DataFrame:
    """
    Loads and processes the ToxCast input table.

    Ex. input record:
        assay_component_endpoint_name | ACEA_ER_80hr
        assay_component_desc          | ACEA_ER_80hr, is ...
        biological_process_target     | cell proliferation
        tissue                        | null
        cell_format                   | cell line
        cell_short_name               | T47D
        assay_format_type             | cell-based
        official_symbol               | ESR1
        eventId                       | null

    Ex. output record:
        targetFromSourceId | ESR1
        event              | cell proliferation
        eventId            | null
        biosample          | {null, null, T47D...
        datasource         | ToxCast
        url                | https://www.epa.g...
        study              | {ACEA_ER_80hr, AC...
    """

    return spark.read.csv(toxcast, sep='\t', header=True).select(
        F.trim(F.col('official_symbol')).alias('targetFromSourceId'),
        F.col('biological_process_target').alias('event'),
        'eventId',
        F.struct(
            F.col('tissue').alias('tissueLabel'),
            F.lit(None).alias('tissueId'),
            F.col('cell_short_name').alias('cellLabel'),
            F.col('cell_format').alias('cellFormat'),
            F.lit(None).alias('cellId'),
        ).alias('biosample'),
        F.lit('ToxCast').alias('datasource'),
        F.lit(
            'https://www.epa.gov/chemical-research/exploring-toxcast-data-downloadable-data'
        ).alias('url'),
        F.struct(
            F.col('assay_component_endpoint_name').alias('name'),
            F.col('assay_component_desc').alias('description'),
            F.col('assay_format_type').alias('type'),
        ).alias('study'),
    )
Example #19
def langCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "2 minutes") \
        .groupBy(
            window(col("timestamp"), "2 minutes", "1 minutes"),
            col(colName)
        ).count() \
        .select(colName, "count", to_json(struct(colName, "count")).alias("value"))
Example #20
def ndcg(df,
         k,
         label_col='label',
         position_col='hit_position',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    float
        The ndcg@k value, always between 0 and 1
    """
    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (df.select(label_col, *query_cols).withColumn(
        'rn',
        F.row_number().over(w)).where(F.col('rn') <= k).groupBy(
            *query_cols).agg(
                F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))
    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (df.select(
        label_col, position_col, *query_cols).withColumn(
            'rn',
            F.row_number().over(w)).where(F.col('rn') <= k).groupBy(
                *query_cols).agg(
                    F.collect_list(F.struct(label_col,
                                            'rn')).alias('predictedTopAtK')))
    return (topAtK.join(predictedTopAtK, query_cols, how='inner').select(
        _ndcg_at(k, label_col)(
            'predictedTopAtK', 'topAtK').alias('ndcgAtK')).select(
                F.mean('ndcgAtK').alias('ndcgAtK')).collect()[0].ndcgAtK)
Example #21
def wordCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "10 seconds") \
        .withColumn('word', explode(split(col(colName), ' '))) \
        .groupBy(window(col("timestamp"), "10 seconds", "5 seconds"),
                 col('word')
                 ).count() \
        .select("word", "count", to_json(struct("word", "count")).alias("value"))
Example #22
def stats(col: str) -> F.Column:
    return F.struct(
        F.min(col).alias('min'),
        F.max(col).alias('max'),
        F.avg(col).alias('avg'),
        F.count(col).alias('count'),
        F.countDistinct(col).alias('countDistinct'),
    )
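
A quick usage sketch (toy data, not from the original source) for the stats helper above; it assumes F is pyspark.sql.functions and an active SparkSession named `spark`:

df = spark.createDataFrame([(1,), (2,), (2,), (5,)], ["x"])

# All five summary statistics for column "x" collected as a single struct value:
# {min=1, max=5, avg=2.5, count=4, countDistinct=3}
df.agg(stats("x").alias("x_stats")).show(truncate=False)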
Example #23
def test_auto_mapper_struct_with_mappers(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family=A.struct({"given": "foo"})))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert_compare_expressions(
        sql_expressions["dst2"],
        struct(
            expr("usual").alias("use"),
            struct(expr("foo").alias("given")).alias("family"),
        ).alias("dst2"),
    )

    result_df.printSchema()
    result_df.show()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1][0] == "foo"
Example #24
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     return struct(*[
         self.get_value(value=value,
                        source_df=source_df,
                        current_column=current_column).alias(key)
         for key, value in self.value.items()
     ])
Example #25
 def __init__(self, kdf: DataFrame, scol: Optional[spark.Column] = None):
     assert len(kdf._internal._index_map) > 1
     self._kdf = kdf
     if scol is None:
         IndexOpsMixin.__init__(self, kdf._internal.copy(
             scol=F.struct(self._kdf._internal.index_scols)), kdf)
     else:
         IndexOpsMixin.__init__(self, kdf._internal.copy(scol=scol), kdf)
Example #26
def _get_a2b(edges):
    """
    Processes the `edges` DataFrame and returns `a2b` DataFrame
        ((a)-[e]->(b)) to be used for each iteration of BFS.

    :param edges: edges of the graph (contains two special
        columns named "src" and "dst" which specifies an
        edge from vertex "src" to vertex "dst") with the
        following schema:
            | |-- src: str
            | |-- dst: str
            | |-- relationship: str
            | |-- Type: str
            | |-- Source_Type: str
            | |-- Target_Type: str
    :type edges: pyspark.sql.DataFrame
    :return: contains three special columns named "a" (src),
        "e" (edge), "b" (dst) with the following schema:
            | |-- a: pyspark.sql.StructType
            | |-- -- id: str
            | |-- -- Category: str
            | |-- e: pyspark.sql.StructType
            | |-- -- src: str
            | |-- -- dst: str
            | |-- -- relationship: str
            | |-- -- Type: str
            | |-- -- Source_Type: str
            | |-- -- Target_Type: str
            | |-- b: pyspark.sql.StructType
            | |-- -- id: str
            | |-- -- Category: str
    :rtype: pyspark.sql.DataFrame
    """
    edges_column_names = [col(column_name) for column_name in edges.columns]
    a2b = (edges.withColumn('e', struct(*edges_column_names)).select('e'))
    a2b = (a2b.withColumn(
        'a',
        struct(
            col('e.src').alias('id'),
            col('e.Source_Type').alias('Category'))).withColumn(
                'b',
                struct(
                    col('e.dst').alias('id'),
                    col('e.Target_Type').alias('Category'))))

    return a2b
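
A minimal sketch (toy edge data invented for illustration) showing the a2b layout produced above; it assumes the col/struct imports from pyspark.sql.functions and an active SparkSession named `spark`:

edges = spark.createDataFrame(
    [("u1", "u2", "follows", "edge", "User", "User")],
    ["src", "dst", "relationship", "Type", "Source_Type", "Target_Type"],
)

# Each output row carries the whole edge as "e" plus derived "a" (src) and "b" (dst) structs.
_get_a2b(edges).printSchema()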
Example #27
 def __init__(self, kdf: DataFrame):
     assert len(kdf._internal._index_map) > 1
     scol = F.struct(kdf._internal.index_scols)
     data_columns = kdf._sdf.select(scol).columns
     internal = kdf._internal.copy(scol=scol,
                                   column_index=[(col, None) for col in data_columns],
                                   column_index_names=None)
     IndexOpsMixin.__init__(self, internal, kdf)
Example #28
def cast_nested_col(df: pyspark.sql.DataFrame,
                    col_name: str,
                    col_type: str,
                    alias: str = None,
                    date_format: str = None) -> pyspark.sql.DataFrame:

    if alias:
        # get columns names
        alias_columns = [
            F.col(f"{alias}.{col.name}") for col in df.schema[alias].dataType
            if col.name != col_name
        ]

        if col_type == 'timestamp':
            return df.withColumn(
                alias,
                F.struct([
                    *alias_columns,
                    F.to_timestamp(F.col(f"{alias}.{col_name}"),
                                   date_format).alias(col_name)
                ]))
        elif col_type == 'date':
            return df.withColumn(
                alias,
                F.struct([
                    *alias_columns,
                    F.to_date(F.col(f"{alias}.{col_name}"),
                              date_format).alias(col_name)
                ]))
        else:
            return df.withColumn(
                alias,
                F.struct([
                    *alias_columns,
                    F.col(f"{alias}.{col_name}").cast(col_type).alias(col_name)
                ]))
    else:
        if col_type == 'timestamp':
            return df.withColumn(col_name,
                                 F.to_timestamp(F.col(col_name), date_format))
        elif col_type == 'date':
            return df.withColumn(col_name,
                                 F.to_date(F.col(col_name), date_format))
        else:
            return df.withColumn(col_name, F.col(col_name).cast(col_type))
Example #29
 def _internal(self) -> InternalFrame:
     internal = self._psdf._internal
     scol = F.struct(*internal.index_spark_columns)
     return internal.copy(
         column_labels=[None],
         data_spark_columns=[scol],
         data_fields=[None],
         column_label_names=None,
     )
Example #30
    def get_word_vec(self):
        data = self.merge_df.groupBy('user_id').agg(
            func.sort_array(func.collect_list(func.struct(func.col('time'), func.col('ad_id'))), asc=True).alias(
                'items'))
        data = data.withColumn("items", func.udf(lambda x: [i[1] for i in x], ArrayType(StringType()))('items'))

        word2Vec = Word2Vec(vectorSize=128, minCount=10, inputCol="items", outputCol="result")
        model = word2Vec.fit(data.repartition(1000))
        return model
Example #31
    def to_json_df(self, file_stream_df):
        """Converts the DataFrame stream to a JSON format.

        Args:
            file_stream_df (DataFrame): The DataFrame.

        Returns:
            A DataFrame which holds the data in JSON format.
        """
        return file_stream_df.select(to_json(struct([file_stream_df[x] for x in file_stream_df.columns])).alias("value"))
Example #32
File: multi.py  Project: aaalan321/spark
 def _internal(self):
     internal = self._kdf._internal
     scol = F.struct(internal.index_spark_columns)
     return internal.copy(
         column_labels=[None],
         data_spark_columns=[scol],
         data_dtypes=[None],
         column_label_names=None,
     )
Example #33
    def test_vectorized_udf_struct_type(self):
        df = self.spark.range(10)
        return_type = StructType([
            StructField('id', LongType()),
            StructField('str', StringType())])

        def func(id):
            return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

        f = pandas_udf(func, returnType=return_type)

        expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                             .alias('struct')).collect()

        actual = df.select(f(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        g = pandas_udf(func, 'id: long, str: string')
        actual = df.select(g(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        struct_f = pandas_udf(lambda x: x, return_type)
        actual = df.select(struct_f(struct(col('id'), col('id').cast('string').alias('str'))))
        self.assertEqual(expected, actual.collect())
Example #34
    def test_vectorized_udf_chained_struct_type(self):
        import pandas as pd

        df = self.spark.range(10)
        return_type = StructType([
            StructField('id', LongType()),
            StructField('str', StringType())])

        @pandas_udf(return_type)
        def f(id):
            return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

        g = pandas_udf(lambda x: x, return_type)

        expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                             .alias('struct')).collect()

        actual = df.select(g(f(col('id'))).alias('struct')).collect()
        self.assertEqual(expected, actual)
Example #35
# COMMAND ----------

freq = df.stat.freqItems(["a", "b", "c"], 0.4)
freq.collect()[0]

# COMMAND ----------

# MAGIC %md As shown above, `{a = 1}, {b = 2}, {c = 1, 3}` are frequent items; note that `{a = 65}` and `{b = 130}` are false positives.
# MAGIC 
# MAGIC You can also find frequent items for column combinations by creating a composite column using the struct function:

# COMMAND ----------

from pyspark.sql.functions import struct
freq = df.withColumn('ab', struct('a', 'b')).stat.freqItems(['ab'], 0.4)
freq.collect()[0]

# COMMAND ----------

# MAGIC %md From the above example, the combinations `a=99 and b=198` and `a=1 and b=2` appear frequently in this dataset. Note that `a=99 and b=198` is a false positive.

# COMMAND ----------

# MAGIC %md ### Mathematical Functions
# MAGIC Spark 1.4 also added a suite of mathematical functions, which users can apply to their columns with ease. The list of supported math functions comes from [this file](https://github.com/apache/spark/blob/efe3bfdf496aa6206ace2697e31dd4c0c3c824fb/python/pyspark/sql/functions.py#L109). The inputs need to be columns. Functions that take a single argument, such as `cos, sin, floor, ceil`, operate on one column. For functions that take two arguments as input, such as `pow, hypot`, either two columns or a combination of a double and a column can be supplied.

# COMMAND ----------

from pyspark.sql.functions import *
df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10) * 3.14)
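
# COMMAND ----------

# MAGIC %md A minimal sketch (not from the original notebook) of the two-argument form described above: `pow` and `hypot` accept either two columns or a column combined with a double literal, applied here to the `uniform` column created in the previous cell.

# COMMAND ----------

# pow(column, double) and hypot(column, double); both also accept two columns.
df.select('uniform',
          pow(col('uniform'), 2).alias('uniform_squared'),
          hypot(col('uniform'), 3.0).alias('hyp')).show(3)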
# COMMAND ----------

fill_cols_vals = {"StockCode": 5, "Description" : "No Value"}
df.na.fill(fill_cols_vals)


# COMMAND ----------

df.na.replace([""], ["UNKNOWN"], "Description")


# COMMAND ----------

from pyspark.sql.functions import struct
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")


# COMMAND ----------

from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)


# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)