def test_vectorized_udf_struct_type(self):
    df = self.spark.range(10)
    return_type = StructType([
        StructField('id', LongType()),
        StructField('str', StringType())])

    def func(id):
        return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

    f = pandas_udf(func, returnType=return_type)

    expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                         .alias('struct')).collect()
    actual = df.select(f(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    g = pandas_udf(func, 'id: long, str: string')
    actual = df.select(g(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    struct_f = pandas_udf(lambda x: x, return_type)
    actual = df.select(struct_f(struct(col('id'), col('id').cast('string').alias('str'))))
    if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        with QuietTest(self.sc):
            from py4j.protocol import Py4JJavaError
            with self.assertRaisesRegexp(
                    Py4JJavaError,
                    'Unsupported type in conversion from Arrow'):
                self.assertEqual(expected, actual.collect())
    else:
        self.assertEqual(expected, actual.collect())

def test_automapper_filter_and_transform_fluent(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.filter(
                column=A.column("identifier"),
                func=lambda x: x["use"] == lit("usual")
            ).transform(A.complex(bar=A.field("value"), bar2=A.field("system")))
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")),
        ).alias("age")
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)

def getDirectAccess(scOrder3, principalDS):
    scOrder3RouterInterfaceDS = scOrder3.filter(
        scOrder3.sc_id.isNotNull() & scOrder3.ne_carr.isNotNull())

    auxResource = scOrder3RouterInterfaceDS \
        .filter(scOrder3RouterInterfaceDS.resource.isNotNull()) \
        .drop("port_resource")

    auxPortResource = scOrder3RouterInterfaceDS \
        .filter(scOrder3RouterInterfaceDS.port_resource.isNotNull()) \
        .drop("resource") \
        .withColumnRenamed("port_resource", "resource")

    auxTotal = auxResource \
        .unionByName(auxPortResource) \
        .groupBy("sc_id") \
        .agg(F.collect_list(F.struct("ne_carr", "resource")).alias("router_interface"))

    joinedFastOrder3 = principalDS.join(
        auxTotal, principalDS.service_circuit == auxTotal.sc_id, "inner")

    direct_Order3 = joinedFastOrder3.filter(
        joinedFastOrder3.l3_acc_cfs_type == "Direct Access CFS Instance")

    directOrder3 = FastDsl.routerInterfaceVendorType(direct_Order3)
    directOrder3.cache()
    return directOrder3

def melt(self, id_vars, value_vars, var_name="variable", value_name="value", data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark DataFrame
    :param id_vars: Columns with unique values to keep as identifiers
    :param value_vars: Columns to unpivot into rows
    :param var_name: Column name for the variable names
    :param value_name: Column name for the values
    :param data_type: All columns are cast to this single type
    :return:
    """
    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)

from pyspark.sql.functions import array, col, struct


def expand_array_col_into_seperate_col(colName):
    # Wrap the first two elements of the array column into a named struct,
    # returned as a single-element array.
    result = array([
        struct(
            col(colName).getItem(0).alias("first_year"),
            col(colName).getItem(1).alias("sec_year"))
    ])
    return result

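# Hedged usage sketch (not from the original source); assumes an active SparkSession
# named `spark`. It shows the single-element array<struct> the helper builds.
df = spark.createDataFrame([([2019, 2020],)], ["years"])
df.select(expand_array_col_into_seperate_col("years").alias("year_pair")) \
  .selectExpr("year_pair[0].first_year", "year_pair[0].sec_year") \
  .show()
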
def melt(self,
         value_vars: Iterable[str],
         id_vars: Iterable[str] = None,
         var_name: str = "variable",
         value_name: str = "value") -> TDataFrame:
    """
    Convert :class:`DataFrame` from wide to long format.

    :param self: Spark DataFrame
    :param value_vars: columns to unpivot into rows
    :param id_vars: identifier columns to keep as-is
    :param var_name: name of the output column holding the variable names
    :param value_name: name of the output column holding the values
    :return: the melted :class:`DataFrame`
    """
    id_vars = id_vars if id_vars is not None else []

    # Create array<struct<variable: str, value: ...>>
    variable_name_with_column_values = F.array(
        *(F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
          for c in value_vars))

    # Add to the DataFrame and explode
    exploded_vars_and_vals = self.withColumn(
        "variable_name_with_column_values",
        F.explode(variable_name_with_column_values))

    cols = id_vars + [
        F.col("variable_name_with_column_values")[x].alias(x)
        for x in [var_name, value_name]
    ]

    return exploded_vars_and_vals.select(*cols)

def _mark_as_lit(data, data_type):
    # To support nested types, 'data_type' is required.
    assert data_type is not None

    if data is None:
        return f.lit(data).cast(data_type)

    if isinstance(data_type, ArrayType):
        assert isinstance(data, list)
        # Sadly you cannot create a literal from just an array in pyspark
        return f.array([_mark_as_lit(x, data_type.elementType) for x in data])
    elif isinstance(data_type, StructType):
        assert isinstance(data, tuple) and len(data) == len(data_type.fields)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        children = zip(data, data_type.fields)
        return f.struct([_mark_as_lit(x, fd.dataType).alias(fd.name) for x, fd in children])
    elif isinstance(data_type, DateType):
        # Due to https://bugs.python.org/issue13305 we need to zero pad for years prior to 1000,
        # but this works for all of them
        dateString = data.strftime("%Y-%m-%d").zfill(10)
        return f.lit(dateString).cast(data_type)
    elif isinstance(data_type, MapType):
        assert isinstance(data, dict)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        col_array = []
        for k in data:
            col_array.append(_mark_as_lit(k, data_type.keyType))
            col_array.append(_mark_as_lit(data[k], data_type.valueType))
        return f.create_map(*col_array)
    else:
        # lit does not take a data type so we might have to cast it
        return f.lit(data).cast(data_type)

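# Hedged usage sketch (not from the original source); the type and values below are
# illustrative. It shows the struct/array literal columns the helper produces.
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, IntegerType, StringType, StructField, StructType

pair_type = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
])
# Equivalent to f.struct(f.lit(1).cast(...).alias("id"), f.lit("a").cast(...).alias("name"))
lit_struct = _mark_as_lit((1, "a"), pair_type)
# An f.array of two such struct literals
lit_array = _mark_as_lit([(1, "a"), (2, "b")], ArrayType(pair_type))
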
def process_demographic_data(spark, input_data, output_data):
    """
    Process the demographic data by dropping duplicate rows and creating a new
    column 'Major Race' based on a group-by aggregation.

    Parameters:
        spark: the Spark session
        input_data: the path of the input folder of the data on the local machine
        output_data: the output folder in the S3 bucket
    """
    demo_data = input_data + 'us-cities-demographics.csv'
    demo_df = spark.read.format('csv').options(header='true', sep=';').load(demo_data)

    demo_df = demo_df.select('City', 'State', 'Median Age', 'Male Population', 'Female Population',
                             'Total Population', 'Foreign-born', 'State Code', 'Race', 'Count') \
        .drop_duplicates(subset=['City', 'State', 'Race'])

    demo_df = demo_df.withColumn("Count", col("Count").cast(IntegerType()))

    # Using group by with pivot to know the major race of every city
    group_df = demo_df.groupby('City', 'State', 'Median Age', 'Male Population', 'Female Population',
                               'Total Population', 'Foreign-born', 'State Code') \
        .pivot('race').agg(max_('Count'))

    group_df = group_df.na.fill({'Hispanic or Latino': 0, 'White': 0, 'Asian': 0,
                                 'Black or African-American': 0,
                                 'American Indian and Alaska Native': 0})

    cols = group_df.columns[8:13]
    maxcol = F.udf(lambda row: cols[row.index(max(row))], StringType())
    group_df = group_df.withColumn("Major Race",
                                   maxcol(F.struct([group_df[x] for x in group_df.columns[8:13]])))

    group_df.write.option("header", "true").csv(output_data + 'demographic_data/')

def view(df, state_col='_state', updated_col='_updated', hash_col='_hash'):
    """
    Calculate a view from a log of events by performing the following actions:
        - squashing the events for each entry record to the last one
        - removing deleted records from the list
    """
    c = set(df.columns).difference({state_col, updated_col, hash_col})
    colnames = [x for x in df.columns if x in c]

    if updated_col not in df.columns:
        return df

    if state_col not in df.columns:
        return df

    selected_columns = colnames + ['_last.*']
    groupby_columns = colnames

    # groupby hash_col first if available
    if hash_col in df.columns:
        selected_columns = selected_columns + [hash_col]
        groupby_columns = [hash_col] + groupby_columns

    row_groups = df.groupBy(groupby_columns)
    get_sorted_array = F.sort_array(F.collect_list(
        F.struct(F.col(updated_col), F.col(state_col))), asc=False)
    df_view = row_groups.agg(
        get_sorted_array.getItem(0).alias('_last')).select(*selected_columns)
    df_view = df_view.filter("{} = 0".format(state_col))

    return df_view

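# Hedged usage sketch (illustrative data, not from the original source); assumes an
# active SparkSession `spark`. Events for "k1" are squashed to their latest state and
# the deleted record "k2" (_state != 0) drops out of the view.
events = spark.createDataFrame(
    [("k1", 0, 1, "h1"), ("k1", 0, 2, "h1"), ("k2", 1, 1, "h2")],
    ["key", "_state", "_updated", "_hash"],
)
view(events).show()
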
def pred(var):
    global traffic_df_explicit, spark, schema_for_m
    traffic_for_m = traffic_df_explicit.select(
            traffic_df_explicit['TID'],
            traffic_df_explicit['DST'],
            traffic_df_explicit['TS'].cast(IntegerType()).alias('ds'),
            traffic_df_explicit[var].alias('y'))\
        .filter("TID like '%DSO05LM%' and DST like '%01:00:5e:50:01:42%'")\
        .groupBy('TID', 'DST')\
        .agg(collect_list(struct('ds', 'y')).alias('data'))\
        .rdd.map(lambda r: transform_data_m(r))\
        .map(lambda d: partition_data_m(d))\
        .filter(lambda d: len(d['train_data']) > 2)\
        .map(lambda d: create_model_m(d))\
        .map(lambda d: train_model_m(d))\
        .map(lambda d: make_forecast_m(d))\
        .map(lambda d: reduce_data_scope_m(d))\
        .flatMap(lambda d: expand_predictions_m(d))

    traffic_for_m.cache()
    df_for_m = spark.createDataFrame(traffic_for_m, schema_for_m)

    # thread
    TH = Thread(target=forecast_from_spark, args=(df_for_m, var))
    TH.start()

def melt(df: DataFrame, id_vars: Iterable[str], value_vars: Iterable[str],
         var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    # Create array<struct<variable: str, value: ...>>
    # Each row gets one struct per value column, tagged with that column's name
    _vars_and_vals = array(
        *(struct(lit(c).alias(var_name), col(c).alias(value_name)) for c in value_vars))

    # Add to the DataFrame and explode
    # When exploding, only columns that are included in the row structure will
    # be included in the dataframe
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # This one will have all the column names necessary
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    # When returning, select from the previous one
    return _tmp.select(*cols)

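# Hedged usage sketch (not from the original source); assumes an active SparkSession
# `spark` and the pyspark.sql.functions imports the function above relies on.
wide = spark.createDataFrame([(1, 10.0, 20.0)], ["id", "a", "b"])
long = melt(wide, id_vars=["id"], value_vars=["a", "b"])
long.show()  # yields rows (1, 'a', 10.0) and (1, 'b', 20.0)
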
def test_spark_udf_with_single_arg(spark):
    from pyspark.sql.functions import struct

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [",".join(model_input.columns.tolist())] * len(model_input)

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel())

        udf = mlflow.pyfunc.spark_udf(spark, "runs:/{}/model".format(run.info.run_id),
                                      result_type=StringType())

        data1 = spark.createDataFrame(pd.DataFrame({
            "a": [1],
            "b": [4]
        })).repartition(1)

        result = data1.withColumn("res", udf("a")).select("res").toPandas()
        assert result.res[0] == "0"

        data2 = data1.select(struct("a", "b").alias("ab"))
        result = data2.withColumn("res", udf("ab")).select("res").toPandas()
        assert result.res[0] == "a,b"

def test_nested_higher_order_function(self):
    # SPARK-35382: lambda vars must be resolved properly in nested higher order functions
    from pyspark.sql.functions import flatten, struct, transform

    df = self.spark.sql(
        "SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters"
    )

    actual = df.select(
        flatten(
            transform(
                "numbers",
                lambda number: transform(
                    "letters",
                    lambda letter: struct(
                        number.alias("n"),
                        letter.alias("l"))),
            ))).first()[0]

    expected = [
        (1, "a"),
        (1, "b"),
        (1, "c"),
        (2, "a"),
        (2, "b"),
        (2, "c"),
        (3, "a"),
        (3, "b"),
        (3, "c"),
    ]

    self.assertEquals(actual, expected)

def main(self, sc: SparkContext, *args):
    """
    Takes in a SparkContext and the list of arguments generated by `app_options`
    and executes the PySpark job.
    """
    spark = SparkSession(sc)

    # Parsing app options
    observations_parquet_path = args[0]
    output_path = args[1]

    observations_df = spark.read.parquet(observations_parquet_path)
    adult_lacz_expression_data = get_lacz_expression_data(observations_df, "adult")
    embryo_lacz_expression_data = get_lacz_expression_data(observations_df, "embryo")
    lacz_expression_data = adult_lacz_expression_data.union(embryo_lacz_expression_data)
    lacz_expression_data = lacz_expression_data.withColumn("id", col("gene_accession_id"))

    for col_name in lacz_expression_data.columns:
        lacz_expression_data = lacz_expression_data.withColumnRenamed(
            col_name, to_camel_case(col_name))

    lacz_expression_data = lacz_expression_data.groupBy("id").agg(
        collect_set(
            struct(*[
                col_name for col_name in lacz_expression_data.columns
                if col_name != "id"
            ])).alias("expressionData"))

    lacz_expression_data.write.partitionBy("id").json(output_path)

def test_smvExpandStruct(self):
    schema = "id:String;a:Double;b:Double"
    df1 = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
    df2 = df1.select(col("id"), struct("a", "b").alias("c"))
    res = df2.smvExpandStruct("c")
    expect = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
    self.should_be_same(expect, res)

def melt(self, id_vars, value_vars, var_name="variable", value_name="value", data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark DataFrame
    :param id_vars: Column with unique values
    :param value_vars: Column names that are going to be converted to column values
    :param var_name: Column name for vars
    :param value_name: Column name for values
    :param data_type: All columns must have the same type. It will transform all columns to this data type.
    :return:
    """
    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)

def main():
    # process input
    arguments = parse_args()

    # initialize spark
    spark = init(arguments)

    # process input
    df = readInput(arguments, spark)
    if arguments.debug:
        print lineno()
        df.show()

    # Get num conversions per user
    top10ConvertingUsers = extractTopConvertingUsers(df, 10)
    if arguments.debug:
        print top10ConvertingUsers
    writeDataframe('q1_top10ConvertingUsers', top10ConvertingUsers,
                   arguments.printHeader, arguments.partitions)

    # sessionize
    # TODO translate type to numbers so that start session will precede other actions
    windowval = Window.partitionBy('user_id').orderBy(
        'timestamp').rangeBetween(Window.unboundedPreceding, 0)
    dfSessionized = df.withColumn('session_id',
                                  fn.sum(fn.when(df["type"] == 'start_session', 1).otherwise(0)).over(windowval))\
        .groupBy('user_id', 'session_id')\
        .agg(fn.collect_list(fn.struct('type', 'url', 'timestamp')).alias('path'))
    if arguments.debug:
        print lineno()
        dfSessionized.show(100)

    convertionDistancePerUser = extractMinConversion(dfSessionized)
    if arguments.debug:
        print lineno()
        convertionDistancePerUser.show(100)
    writeDataframe('q2_conversionDistancePerUser', convertionDistancePerUser,
                   arguments.printHeader, arguments.partitions)

    avgConvserionDistance = convertionDistancePerUser.agg(
        fn.avg('conversion_distance').alias('avg_converting_distance'))
    writeDataframe('q3_avgConversionDistance', avgConvserionDistance,
                   arguments.printHeader, arguments.partitions)

    if arguments.poiFiles is not None:
        global pathOfInterest
        for poiPath in arguments.poiFiles.split(','):
            if arguments.debug:
                print 'Processing path ', poiPath
            for filePath in glob.glob(poiPath):
                if arguments.debug:
                    print 'Processing file ', filePath
                pathOfInterest = readFileToList(filePath)

                # Get users matching path of urls
                patternMatchingUsers = extractUsersMatchingPath(dfSessionized)
                if arguments.debug:
                    print lineno()
                    patternMatchingUsers.show()
                writeDataframe('q4_patternMatchingUsers/' + filePath, patternMatchingUsers,
                               arguments.printHeader, arguments.partitions)

def process_toxcast(toxcast: str) -> DataFrame:
    """
    Loads and processes the ToxCast input table.

    Ex. input record:
        assay_component_endpoint_name | ACEA_ER_80hr
        assay_component_desc          | ACEA_ER_80hr, is ...
        biological_process_target     | cell proliferation
        tissue                        | null
        cell_format                   | cell line
        cell_short_name               | T47D
        assay_format_type             | cell-based
        official_symbol               | ESR1
        eventId                       | null

    Ex. output record:
        targetFromSourceId | ESR1
        event              | cell proliferation
        eventId            | null
        biosample          | {null, null, T47D...
        datasource         | ToxCast
        url                | https://www.epa.g...
        study              | {ACEA_ER_80hr, AC...
    """
    return spark.read.csv(toxcast, sep='\t', header=True).select(
        F.trim(F.col('official_symbol')).alias('targetFromSourceId'),
        F.col('biological_process_target').alias('event'),
        'eventId',
        F.struct(
            F.col('tissue').alias('tissueLabel'),
            F.lit(None).alias('tissueId'),
            F.col('cell_short_name').alias('cellLabel'),
            F.col('cell_format').alias('cellFormat'),
            F.lit(None).alias('cellId'),
        ).alias('biosample'),
        F.lit('ToxCast').alias('datasource'),
        F.lit(
            'https://www.epa.gov/chemical-research/exploring-toxcast-data-downloadable-data'
        ).alias('url'),
        F.struct(
            F.col('assay_component_endpoint_name').alias('name'),
            F.col('assay_component_desc').alias('description'),
            F.col('assay_format_type').alias('type'),
        ).alias('study'),
    )

def langCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "2 minutes") \
        .groupBy(
            window(col("timestamp"), "2 minutes", "1 minutes"),
            col(colName)
        ).count() \
        .select(colName, "count", to_json(struct(colName, "count")).alias("value"))

def ndcg(df, k, label_col='label', position_col='hit_position',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    float
        The ndcg@k value, always between 0 and 1
    """
    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df.select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df.select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return (
        topAtK
        .join(predictedTopAtK, query_cols, how='inner')
        .select(_ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
        .select(F.mean('ndcgAtK').alias('ndcgAtK'))
        .collect()[0].ndcgAtK)

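# Hedged usage sketch (illustrative data, not from the original source); assumes an
# active SparkSession `spark` and that the companion `_ndcg_at` UDF from the same
# module is available.
hits = spark.createDataFrame(
    [("enwiki", "q1", "s1", 3, 1),
     ("enwiki", "q1", "s1", 1, 2),
     ("enwiki", "q1", "s1", 2, 3)],
    ["wikiid", "query", "session_id", "label", "hit_position"],
)
score = ndcg(hits, k=3)  # a float between 0 and 1
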
def wordCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "10 seconds") \
        .withColumn('word', explode(split(col(colName), ' '))) \
        .groupBy(
            window(col("timestamp"), "10 seconds", "5 seconds"),
            col('word')
        ).count() \
        .select("word", "count", to_json(struct("word", "count")).alias("value"))

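# Hedged sketch of consuming the query above (not from the original source). The
# stream name, topic, server, and checkpoint path are placeholders; `stream_df` is
# assumed to be a streaming DataFrame with "timestamp" and "text" columns. The JSON
# "value" column built with to_json(struct(...)) is what the Kafka sink expects.
query = (wordCountQuery(stream_df, "text")
         .select("value")
         .writeStream
         .format("kafka")
         .option("kafka.bootstrap.servers", "localhost:9092")
         .option("topic", "word_counts")
         .option("checkpointLocation", "/tmp/word_counts_ckpt")
         .outputMode("update")
         .start())
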
def stats(col: str) -> F.Column:
    return F.struct(
        F.min(col).alias('min'),
        F.max(col).alias('max'),
        F.avg(col).alias('avg'),
        F.count(col).alias('count'),
        F.countDistinct(col).alias('countDistinct'),
    )

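# Hedged usage sketch (not from the original source); assumes `spark` is an active
# SparkSession and `F` is pyspark.sql.functions, as in the helper above.
df = spark.createDataFrame([(1,), (2,), (2,)], ["x"])
# One row with min, max, avg, count and countDistinct for column x
df.agg(stats("x").alias("x_stats")).select("x_stats.*").show()
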
def test_auto_mapper_struct_with_mappers(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family=A.struct({"given": "foo"})))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert_compare_expressions(
        sql_expressions["dst2"],
        struct(
            expr("usual").alias("use"),
            struct(expr("foo").alias("given")).alias("family"),
        ).alias("dst2"),
    )

    result_df.printSchema()
    result_df.show()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1][0] == "foo"

def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    return struct(*[
        self.get_value(value=value,
                       source_df=source_df,
                       current_column=current_column).alias(key)
        for key, value in self.value.items()
    ])

def __init__(self, kdf: DataFrame, scol: Optional[spark.Column] = None):
    assert len(kdf._internal._index_map) > 1

    self._kdf = kdf
    if scol is None:
        IndexOpsMixin.__init__(
            self,
            kdf._internal.copy(scol=F.struct(self._kdf._internal.index_scols)),
            kdf)
    else:
        IndexOpsMixin.__init__(self, kdf._internal.copy(scol=scol), kdf)

def _get_a2b(edges):
    """
    Processes the `edges` DataFrame and returns the `a2b` DataFrame ((a)-[e]->(b))
    to be used for each iteration of BFS.

    :param edges: edges of the graph (contains two special columns named "src" and "dst"
        which specify an edge from vertex "src" to vertex "dst") with the following schema:
        |    |-- src: str
        |    |-- dst: str
        |    |-- relationship: str
        |    |-- Type: str
        |    |-- Source_Type: str
        |    |-- Target_Type: str
    :type edges: pyspark.sql.DataFrame

    :return: contains three special columns named "a" (src), "e" (edge), "b" (dst)
        with the following schema:
        |    |-- a: pyspark.sql.StructType
        |    |-- -- id: str
        |    |-- -- Category: str
        |    |-- e: pyspark.sql.StructType
        |    |-- -- src: str
        |    |-- -- dst: str
        |    |-- -- relationship: str
        |    |-- -- Type: str
        |    |-- -- Source_Type: str
        |    |-- -- Target_Type: str
        |    |-- b: pyspark.sql.StructType
        |    |-- -- id: str
        |    |-- -- Category: str
    :rtype: pyspark.sql.DataFrame
    """
    edges_column_names = [col(column_name) for column_name in edges.columns]
    a2b = (edges.withColumn('e', struct(*edges_column_names)).select('e'))
    a2b = (a2b.withColumn(
        'a',
        struct(
            col('e.src').alias('id'),
            col('e.Source_Type').alias('Category'))).withColumn(
        'b',
        struct(
            col('e.dst').alias('id'),
            col('e.Target_Type').alias('Category'))))
    return a2b

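# Hedged usage sketch (illustrative edge values, not from the original source);
# assumes an active SparkSession `spark` and the imports used by the function above.
edges = spark.createDataFrame(
    [("u1", "v1", "LINKS_TO", "hyperlink", "User", "Video")],
    ["src", "dst", "relationship", "Type", "Source_Type", "Target_Type"],
)
_get_a2b(edges).printSchema()  # struct columns e, a and b as described in the docstring
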
def __init__(self, kdf: DataFrame):
    assert len(kdf._internal._index_map) > 1

    scol = F.struct(kdf._internal.index_scols)
    data_columns = kdf._sdf.select(scol).columns
    internal = kdf._internal.copy(
        scol=scol,
        column_index=[(col, None) for col in data_columns],
        column_index_names=None)
    IndexOpsMixin.__init__(self, internal, kdf)

def cast_nested_col(df: pyspark.sql.DataFrame, col_name: str, col_type: str,
                    alias: str = None, date_format: str = None) -> pyspark.sql.DataFrame:
    if alias:
        # Get the sibling columns of the nested field so they can be preserved
        alias_columns = [
            F.col(f"{alias}.{col.name}")
            for col in df.schema[alias].dataType
            if col.name != col_name
        ]
        if col_type == 'timestamp':
            return df.withColumn(
                alias,
                F.struct([
                    *alias_columns,
                    F.to_timestamp(F.col(f"{alias}.{col_name}"), date_format).alias(col_name)
                ]))
        elif col_type == 'date':
            return df.withColumn(
                alias,
                F.struct([
                    *alias_columns,
                    F.to_date(F.col(f"{alias}.{col_name}"), date_format).alias(col_name)
                ]))
        else:
            return df.withColumn(
                alias,
                F.struct([
                    *alias_columns,
                    F.col(f"{alias}.{col_name}").cast(col_type).alias(col_name)
                ]))
    else:
        if col_type == 'timestamp':
            return df.withColumn(col_name, F.to_timestamp(F.col(col_name), date_format))
        elif col_type == 'date':
            return df.withColumn(col_name, F.to_date(F.col(col_name), date_format))
        else:
            return df.withColumn(col_name, F.col(col_name).cast(col_type))

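# Hedged usage sketch (illustrative data, not from the original source); assumes an
# active SparkSession `spark` and `F` = pyspark.sql.functions, as in the helper above.
raw = spark.createDataFrame([("2021-01-31", "abc")], ["created", "name"])
nested = raw.select(F.struct("created", "name").alias("meta"))
# Cast the nested string field meta.created to a date while keeping meta.name
casted = cast_nested_col(nested, col_name="created", col_type="date",
                         alias="meta", date_format="yyyy-MM-dd")
casted.printSchema()  # meta.created is now DateType
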
def _internal(self) -> InternalFrame:
    internal = self._psdf._internal
    scol = F.struct(*internal.index_spark_columns)
    return internal.copy(
        column_labels=[None],
        data_spark_columns=[scol],
        data_fields=[None],
        column_label_names=None,
    )

def get_word_vec(self):
    # Collect each user's (time, ad_id) pairs, sorted chronologically
    data = self.merge_df.groupBy('user_id').agg(
        func.sort_array(
            func.collect_list(func.struct(func.col('time'), func.col('ad_id'))),
            asc=True).alias('items'))
    # Keep only the ad_id from each (time, ad_id) struct, preserving the time ordering
    data = data.withColumn(
        "items",
        func.udf(lambda x: [i[1] for i in x], ArrayType(StringType()))('items'))
    word2Vec = Word2Vec(vectorSize=128, minCount=10, inputCol="items", outputCol="result")
    model = word2Vec.fit(data.repartition(1000))
    return model

def to_json_df(self, file_stream_df):
    """Converts the DataFrame stream to a JSON format.

    Args:
        file_stream_df (DataFrame): The DataFrame.

    Returns:
        A dataframe which holds the data in a JSON format.
    """
    return file_stream_df.select(
        to_json(struct([file_stream_df[x] for x in file_stream_df.columns])).alias("value"))

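# Hedged usage sketch (not from the original source); `converter` is a hypothetical
# instance of the enclosing class, and `spark` an active SparkSession.
df = spark.createDataFrame([(1, "a")], ["id", "name"])
converter.to_json_df(df).show(truncate=False)  # the value column holds {"id":1,"name":"a"}
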
def _internal(self):
    internal = self._kdf._internal
    scol = F.struct(internal.index_spark_columns)
    return internal.copy(
        column_labels=[None],
        data_spark_columns=[scol],
        data_dtypes=[None],
        column_label_names=None,
    )

def test_vectorized_udf_struct_type(self):
    df = self.spark.range(10)
    return_type = StructType([
        StructField('id', LongType()),
        StructField('str', StringType())])

    def func(id):
        return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

    f = pandas_udf(func, returnType=return_type)

    expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                         .alias('struct')).collect()
    actual = df.select(f(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    g = pandas_udf(func, 'id: long, str: string')
    actual = df.select(g(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    struct_f = pandas_udf(lambda x: x, return_type)
    actual = df.select(struct_f(struct(col('id'), col('id').cast('string').alias('str'))))
    self.assertEqual(expected, actual.collect())

def test_vectorized_udf_chained_struct_type(self):
    import pandas as pd

    df = self.spark.range(10)
    return_type = StructType([
        StructField('id', LongType()),
        StructField('str', StringType())])

    @pandas_udf(return_type)
    def f(id):
        return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

    g = pandas_udf(lambda x: x, return_type)

    expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                         .alias('struct')).collect()
    actual = df.select(g(f(col('id'))).alias('struct')).collect()
    self.assertEqual(expected, actual)

# COMMAND ----------

freq = df.stat.freqItems(["a", "b", "c"], 0.4)
freq.collect()[0]

# COMMAND ----------

# MAGIC %md Per the above, `{a = 1}, {b = 2}, {c = 1, 3}` are frequent items; note that `{a = 65}` and `{b = 130}` are false positives.
# MAGIC
# MAGIC You can also find frequent items for column combinations by creating a composite column using the struct function:

# COMMAND ----------

from pyspark.sql.functions import struct

freq = df.withColumn('ab', struct('a', 'b')).stat.freqItems(['ab'], 0.4)
freq.collect()[0]

# COMMAND ----------

# MAGIC %md From the above example, the combinations `a=99 and b=198` and `a=1 and b=2` appear frequently in this dataset. Note that `a=99 and b=198` is a false positive.

# COMMAND ----------

# MAGIC %md ### Mathematical Functions
# MAGIC Spark 1.4 also added a suite of mathematical functions. Users can apply these to their columns with ease. The list of math functions that are supported comes from [this file](https://github.com/apache/spark/blob/efe3bfdf496aa6206ace2697e31dd4c0c3c824fb/python/pyspark/sql/functions.py#L109). The inputs need to be columns. Functions that take a single argument, such as `cos, sin, floor, ceil`, apply to one column. For functions that take two arguments as input, such as `pow, hypot`, either two columns or a combination of a double and a column can be supplied.

# COMMAND ----------

from pyspark.sql.functions import *
df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10) * 3.14)

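# COMMAND ----------

# MAGIC %md A minimal illustrative follow-up (an assumption, not part of the original notebook), using the `df` defined above: single-argument functions such as `cos` take one column, while two-argument functions such as `pow` and `hypot` accept two columns or a column plus a double.

# COMMAND ----------

df.select('uniform', cos('uniform'), pow('uniform', 2.0), hypot('uniform', 'id')).show(3)
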
# COMMAND ----------

fill_cols_vals = {"StockCode": 5, "Description": "No Value"}
df.na.fill(fill_cols_vals)

# COMMAND ----------

df.na.replace([""], ["UNKNOWN"], "Description")

# COMMAND ----------

from pyspark.sql.functions import struct

complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

# COMMAND ----------

from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)

# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)

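# COMMAND ----------

# An illustrative follow-up (assumption, not from the original notebook), assuming
# `col` is imported as in the earlier cells: fields of the "complex" struct column
# can be pulled back out by name.
complexDF.select(col("complex").getField("Description"), col("complex.InvoiceNo")).show(2)
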