def registerFunction(self, name, f, returnType=StringType()):
    """Registers a Python function (including lambda functions) as a UDF
    so it can be used in SQL statements.

    In addition to a name and the function itself, the return type can be optionally
    specified. When the return type is not given it defaults to a string and conversion
    will automatically be done. For any other return type, the produced object must
    match the specified type.

    :param name: name of the UDF
    :param f: python function
    :param returnType: a :class:`pyspark.sql.types.DataType` object
    :return: a wrapped :class:`UserDefinedFunction`

    >>> strlen = spark.catalog.registerFunction("stringLengthString", len)
    >>> spark.sql("SELECT stringLengthString('test')").collect()
    [Row(stringLengthString(test)=u'4')]

    >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
    [Row(stringLengthString(text)=u'3')]

    >>> from pyspark.sql.types import IntegerType
    >>> _ = spark.catalog.registerFunction("stringLengthInt", len, IntegerType())
    >>> spark.sql("SELECT stringLengthInt('test')").collect()
    [Row(stringLengthInt(test)=4)]

    >>> from pyspark.sql.types import IntegerType
    >>> _ = spark.udf.register("stringLengthInt", len, IntegerType())
    >>> spark.sql("SELECT stringLengthInt('test')").collect()
    [Row(stringLengthInt(test)=4)]
    """
    udf = UserDefinedFunction(f, returnType, name)
    self._jsparkSession.udf().registerPython(name, udf._judf)
    return udf._wrapped()
def get_final_uber_df(uber_df):
    """Converts JSON to a DataFrame and puts the data in the most accessible format."""
    # Integer division keeps the result consistent with the declared IntegerType.
    roundit = UserDefinedFunction(lambda x: int(x) // 2, IntegerType())
    joined = uber_df.select(roundit(uber_df["galvanize.time"]).alias("main_time"))
    for start, end in rides.items():
        df = uber_df.select(
            roundit(uber_df["{}.time".format(start)]).alias("time"),
            uber_df["{}.{}.surge_multiplier".format(start, end)][2].alias("uber_{}".format(start)))
        joined = joined.join(df, joined.main_time == df.time).drop("time")
    return joined.sort("main_time")
def test_udf_init_shouldnt_initialize_context(self):
    UserDefinedFunction(lambda x: x, StringType())
    self.assertIsNone(
        SparkContext._active_spark_context,
        "SparkContext shouldn't be initialized when UserDefinedFunction is created.")
    self.assertIsNone(
        SparkSession._instantiatedSession,
        "SparkSession shouldn't be initialized when UserDefinedFunction is created.")
def test_udf_with_udt(self):
    row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
    df = self.spark.createDataFrame([row])
    self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first())
    udf = UserDefinedFunction(lambda p: p.y, DoubleType())
    self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
    udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1), ExamplePointUDT())
    self.assertEqual(ExamplePoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])

    row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0))
    df = self.spark.createDataFrame([row])
    self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first())
    udf = UserDefinedFunction(lambda p: p.y, DoubleType())
    self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
    udf2 = UserDefinedFunction(lambda p: PythonOnlyPoint(p.x + 1, p.y + 1), PythonOnlyUDT())
    self.assertEqual(PythonOnlyPoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])
def _transform(self, dataset):
    SpacyAdvancedTokenize.setup(dataset._sc, dataset.sql_ctx, self.getLang())
    func = SpacyAdvancedTokenize.func(self.getLang(), self.getSpacyFields())
    retType = SpacyAdvancedTokenize.returnType(self.getLang(), self.getSpacyFields())
    udf = UserDefinedFunction(func, retType)
    return dataset.withColumn(self.getOutputCol(), udf(self.getInputCol()))
def benchmark(textInputPath, repeat, number):
    """
    Benchmark wordcount on a provided text input path with a given number of repeats,
    each executed `number` times.

    :param textInputPath: The input path to perform wordcount on.
    :param repeat: Number of times to repeat the test
    :param number: Number of iterations to perform the wordcount per test
    """
    def benchmark_func(func):
        return timeit.repeat(func, repeat=repeat, number=number)

    print("Benchmarking wordcount:")
    tokenize = lambda x: x.split(" ")
    returnUDFType = ArrayType(StringType())
    # session.catalog.registerFunction does not return the UDF under Spark 2.0,
    # so also construct the UserDefinedFunction directly.
    from pyspark.sql.functions import UserDefinedFunction
    tokenizeUDF = UserDefinedFunction(tokenize, returnUDFType, 'split')
    session.catalog.registerFunction("split", tokenize, returnUDFType)
    tokenizeJythonUDF = session.catalog.registerJythonFunction("split", tokenize, returnUDFType)

    rdd = sc.textFile(textInputPath)
    rdd.cache()
    rdd.count()
    print("RDD:")
    print(timeit.repeat(lambda: legacy_word_count(rdd), repeat=repeat, number=number))
    rdd.unpersist()

    df = session.read.text(textInputPath)
    df.cache()
    df.count()

    print("DataFrame Python UDF:")
    python_udf_test_lambda = lambda: dataframe_udf_word_count(tokenizeUDF, df)
    python_udf_times = timeit.repeat(python_udf_test_lambda, repeat=repeat, number=number)
    print(python_udf_times)

    print("DataFrame Jython UDF:")
    jython_udf_test_lambda = lambda: dataframe_udf_word_count(tokenizeJythonUDF, df)
    jython_udf_times = timeit.repeat(jython_udf_test_lambda, repeat=repeat, number=number)
    print(jython_udf_times)

    print("DataFrame Scala UDF:")
    scala_udf_test_lambda = lambda: dataframe_scala_udf_word_count(df)
    scala_udf_times = timeit.repeat(scala_udf_test_lambda, repeat=repeat, number=number)
    print(scala_udf_times)
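# The word-count helpers referenced above (legacy_word_count, dataframe_udf_word_count,
# dataframe_scala_udf_word_count) are defined elsewhere. What follows is only a minimal,
# hypothetical sketch of what `dataframe_udf_word_count` could look like, assuming the
# text sits in the default `value` column produced by `session.read.text(...)`.
def dataframe_udf_word_count(tokenizer_udf, df):
    from pyspark.sql import functions as F
    # Tokenize each line with the supplied UDF, explode into one word per row,
    # group by word, and force evaluation so the benchmark measures real work.
    return (df.select(F.explode(tokenizer_udf(df["value"])).alias("word"))
              .groupBy("word")
              .count()
              .count())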
def task1b(self):
    """1) count of reposts from subscribed and not-subscribed groups"""
    userWallPosts = self.read_parquet_file("userWallPosts.parquet")
    userGroupsSubs = self.read_parquet_file("userGroupsSubs.parquet")

    reposts_t = userWallPosts \
        .filter(userWallPosts.is_reposted) \
        .select('owner_id', 'repost_info.orig_owner_id') \
        .withColumnRenamed("owner_id", "user")

    reposts = reposts_t.filter(reposts_t["orig_owner_id"] < 0)

    user_to_group_sub = userGroupsSubs \
        .select("user", "group") \
        .groupBy("user") \
        .agg(collect_set("group")) \
        .withColumnRenamed("collect_set(group)", "groups")

    def contains(id, groups):
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    temp = reposts.join(user_to_group_sub, "user", how="left_outer")
    reposts_from = temp \
        .withColumn("from_subscribed", contains_udf(temp.orig_owner_id, temp.groups))

    reposts_from_subscribed = reposts_from \
        .filter(reposts_from.from_subscribed == 'true') \
        .select('user') \
        .groupBy('user') \
        .count() \
        .withColumnRenamed("count", "from_subscribed")

    reposts_not_from_subscribed = reposts_from \
        .filter(reposts_from['from_subscribed'] == 'false') \
        .select('user') \
        .groupBy("user") \
        .count() \
        .withColumnRenamed("count", "not_from_subscribed")

    result_table = reposts_from_subscribed \
        .join(reposts_not_from_subscribed, 'user', how="full_outer") \
        .fillna(0)

    return result_table
def filter_df(df_news):
    role_codes_of_interest = ["COP", "GOV", "JUD", "BUS", "CRM", "DEV", "EDU", "ENV",
                              "HLH", "LEG", "MED", "MNC"]

    # Filter data records that are of interest
    df_news = df_news.filter(df_news.ActionGeo_CountryCode == 'US')
    df_news = df_news.filter(df_news.Actor1Code != 'null')
    df_news = df_news.filter(df_news.Actor1Type1Code.isin(role_codes_of_interest))
    df_news.show()
    #df_news = df_news.filter(df_news.GoldsteinScale != 'null')
    #df_news = df_news.filter(df_news.GoldsteinScale != '')

    # There isn't a specific column specifying the state that the event happens in;
    # the state is tied to the country code, so parse out the state and make it a new column.
    name = 'ActionGeo_ADM1Code'
    udf = UserDefinedFunction(lambda x: x[:2] + '-' + x[2:], StringType())
    df_news = df_news.select(*[udf(column).alias(name) if column == name else column
                               for column in df_news.columns])
    split_col = F.split(df_news['ActionGeo_ADM1Code'], '-')
    df_news = df_news.withColumn('action_state', split_col.getItem(1))

    # The GoldsteinScale's range is between -10 and 10, but it's mostly sparse,
    # which makes it hard for a user to interpret directly.
    # The scale is normalized to [0, 1]; a higher value denotes a more positive outcome.
    name = 'GoldsteinScale'
    min_scale, max_scale = -10.0, 10.0
    norm = UserDefinedFunction(lambda x: (x - min_scale) / (max_scale - min_scale), DoubleType())
    df_news = df_news.select(*[norm(col).cast(DoubleType()).alias('normg_scale')
                               if col == name else col for col in df_news.columns])

    df_news = df_news.filter(df_news.action_state != '')
    df_news = aggregate_job(df_news)
    return df_news
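# Illustration only (not part of the pipeline): how the two UDFs above behave on single
# values, assuming a GDELT-style ADM1Code such as 'USCA' (2-letter country + 2-letter state).
assert (lambda x: x[:2] + '-' + x[2:])('USCA') == 'US-CA'     # action_state becomes 'CA' after the split
assert (0.0 - (-10.0)) / (10.0 - (-10.0)) == 0.5              # a neutral Goldstein score maps to 0.5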
def test_udf_with_partial_function(self):
    d = [Row(number=i, squared=i ** 2) for i in range(10)]
    rdd = self.sc.parallelize(d)
    data = self.sqlCtx.createDataFrame(rdd)

    def some_func(col, param):
        if col is not None:
            return col + param

    pfunc = functools.partial(some_func, param=4)
    pudf = UserDefinedFunction(pfunc, LongType())
    res = data.select(pudf(data['number']).alias('plus_four'))
    self.assertEqual(res.agg({'plus_four': 'sum'}).collect()[0][0], 85)
def add_state_column(self, df_news):
    # There isn't a specific column specifying the state that the event happens in;
    # the state is tied to the country code, so parse out the state and make it a new column.
    name = 'ActionGeo_ADM1Code'
    udf = UserDefinedFunction(lambda x: x[:2] + '-' + x[2:], StringType())
    df_news = df_news.select(*[
        udf(column).alias(name) if column == name else column
        for column in df_news.columns
    ])
    split_col = F.split(df_news['ActionGeo_ADM1Code'], '-')
    df_news = df_news.withColumn('action_state', split_col.getItem(1))
    return df_news
def test_udf_with_callable(self):
    d = [Row(number=i, squared=i ** 2) for i in range(10)]
    rdd = self.sc.parallelize(d)
    data = self.sqlCtx.createDataFrame(rdd)

    class PlusFour:
        def __call__(self, col):
            if col is not None:
                return col + 4

    call = PlusFour()
    pudf = UserDefinedFunction(call, LongType())
    res = data.select(pudf(data['number']).alias('plus_four'))
    self.assertEqual(res.agg({'plus_four': 'sum'}).collect()[0][0], 85)
def user_similarities_for_collab(users):
    # Vectors were already normalized during preprocessing, so the dot product is the cosine similarity.
    udf = UserDefinedFunction(lambda arr: float(arr[0].dot(arr[1])), DoubleType())
    users2 = users.select(*(col(x).alias(x + '_2') for x in users.columns))
    users2.cache()
    similarities = users.join(users2, col('user_id') != col('user_id_2'))
    similarities = similarities.withColumn(
        "similarity_unnormed",
        udf(array(similarities.features, similarities.features_2)))
    mean_sim, stddev_sim = similarities.select(
        mean("similarity_unnormed"), stddev("similarity_unnormed")).first()
    similarities = similarities.withColumn(
        "similarity", (col("similarity_unnormed") - mean_sim) / stddev_sim)
    return similarities
def task4b_friends(self):
    """3) aggregate (e.g. count, max, mean) characteristics for comments and likes
    (separately) made by (a) friends and (b) followers per user"""
    friends = self.read_parquet_file("friends.parquet")
    # userWallPosts = self.read_parquet_file("userWallPosts.parquet")
    userWallComments = self.read_parquet_file("userWallComments.parquet")
    userWallLikes = self.read_parquet_file("userWallLikes.parquet")

    user_friends = friends \
        .groupBy("profile") \
        .agg(collect_set("follower")) \
        .withColumnRenamed("collect_set(follower)", "friends") \
        .select("profile", "friends")

    comments = userWallComments.select("post_owner_id", "from_id", "post_id")

    def contains(id, groups):
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    post_comment_to_relation = comments \
        .withColumnRenamed("post_owner_id", "profile") \
        .join(user_friends, "profile", how="left_outer") \
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends"))) \
        .select("profile", "is_from_friend", "post_id") \
        .filter(col("is_from_friend") == "true")

    comments_from_friends_per_post = post_comment_to_relation \
        .groupBy("post_id") \
        .count()

    result_table = post_comment_to_relation \
        .select("profile", "post_id") \
        .join(comments_from_friends_per_post, "post_id") \
        .groupBy("profile") \
        .agg(max("count"), mean("count"), sum("count")) \
        .sort(desc("sum(count)"))

    result_table.show()
def normalize_goldstein(self, df_news):
    # The GoldsteinScale's range is between -10 and 10, but it's mostly sparse,
    # which makes it hard for a user to interpret directly.
    # The scale is normalized to [0, 1]; a higher value denotes a more positive outcome.
    name = 'GoldsteinScale'
    min_scale, max_scale = -10.0, 10.0
    norm = UserDefinedFunction(
        lambda x: (x - min_scale) / (max_scale - min_scale), DoubleType())
    df_news = df_news.select(*[
        norm(col).cast(DoubleType()).alias('normg_scale') if col == name else col
        for col in df_news.columns
    ])
    df_news = df_news.filter(df_news.action_state != '')
    return df_news
def test_udf_defers_judf_initialization(self):
    # This is kept separate from UDFInitializationTests
    # to avoid context initialization when the udf is called.
    f = UserDefinedFunction(lambda x: x, StringType())

    self.assertIsNone(
        f._judf_placeholder,
        "judf should not be initialized before the first call.")

    self.assertIsInstance(f("foo"), Column, "UDF call should return a Column.")

    self.assertIsNotNone(
        f._judf_placeholder,
        "judf should be initialized after UDF has been called.")
def M3():
    # 3) Aggregate (e.g. count, max, mean) characteristics for comments and likes (separately)
    #    made by (a) friends and (b) followers per post.

    def contains(id, groups):  # Filter helper
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    # Get the set of friends for a user
    user_friends = friends \
        .groupBy("profile") \
        .agg(collect_set("follower").alias("friends")) \
        .select("profile", "friends")

    comments = uWallC.select("post_owner_id", "from_id", "post_id")  # Projection
    likes = uWallL.filter(uWallL.itemType == "post") \
        .selectExpr("ownerId as post_owner_id", "likerId as from_id", "itemId as post_id")

    # Join users with comments
    post_comment_to_relation = comments \
        .withColumnRenamed("post_owner_id", "profile") \
        .join(user_friends, "profile", "left_outer") \
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends"))) \
        .select("profile", "is_from_friend", "post_id") \
        .filter(col("is_from_friend") == "true")

    # Join users with likes
    post_like_to_relation = likes \
        .withColumnRenamed("post_owner_id", "profile") \
        .join(user_friends, "profile", "left_outer") \
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends"))) \
        .select("profile", "is_from_friend", "post_id") \
        .filter(col("is_from_friend") == "true")

    comments_from_friends_per_post = post_comment_to_relation \
        .groupBy("post_id").count() \
        .groupBy("post_id") \
        .agg(max("count").alias("cMax"), mean("count").alias("cMean"), sum("count").alias("cSum"))

    likes_from_friends_per_post = post_like_to_relation \
        .groupBy("post_id").count() \
        .groupBy("post_id") \
        .agg(max("count").alias("lMax"), mean("count").alias("lMean"), sum("count").alias("lSum"))

    return comments_from_friends_per_post.join(likes_from_friends_per_post, "post_id")
def create_partition_columns(df):
    udf_getIntervalTime = UserDefinedFunction(getIntervalTime, StringType())
    df = df.withColumn(
        'ingestion_year',
        udf_getIntervalTime(df.creationdate, lit(IS_DATE), lit(YEAR_INDEX)))
    df = df.withColumn(
        'ingestion_month',
        udf_getIntervalTime(df.creationdate, lit(IS_DATE), lit(MONTH_INDEX)))
    df = df.withColumn(
        'ingestion_day',
        udf_getIntervalTime(df.creationdate, lit(IS_DATE), lit(DAY_INDEX)))
    df = df.withColumn(
        'ingestion_hour',
        udf_getIntervalTime(df.creationdate, lit(IS_TIME), lit(HOUR_INDEX)))
    return df
def M1():
    # 1) Count of reposts from subscribed and not-subscribed groups.
    reposts_t = uWallP \
        .filter(uWallP.is_reposted) \
        .select('owner_id', 'repost_info.orig_owner_id') \
        .withColumnRenamed("owner_id", "user")

    reposts = reposts_t.filter(reposts_t["orig_owner_id"] < 0)

    user_to_group_sub = uGroupsS \
        .select("user", "group") \
        .groupBy("user") \
        .agg(collect_set("group")) \
        .withColumnRenamed("collect_set(group)", "groups")

    def contains(id, groups):
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    temp = reposts.join(user_to_group_sub, "user", "left_outer")
    reposts_from = temp \
        .withColumn("from_subscribed", contains_udf(temp.orig_owner_id, temp.groups))

    reposts_from_subscribed = reposts_from \
        .filter(reposts_from.from_subscribed == 'true') \
        .select('user') \
        .groupBy('user') \
        .count() \
        .withColumnRenamed("count", "from_subscribed")

    reposts_not_from_subscribed = reposts_from \
        .filter(reposts_from['from_subscribed'] == 'false') \
        .select('user') \
        .groupBy("user") \
        .count() \
        .withColumnRenamed("count", "not_from_subscribed")

    reposts_count = reposts_from_subscribed \
        .join(reposts_not_from_subscribed, 'user', "full_outer") \
        .fillna(0)

    return reposts_count
def _kFold(self, dataset):
    nFolds = self.getOrDefault(self.numFolds)
    foldCol = self.getOrDefault(self.foldCol)

    datasets = []
    if not foldCol:
        # Do random k-fold split.
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition)
            train = df.filter(~condition)
            datasets.append((train, validation))
    else:
        # Use user-specified fold numbers.
        def checker(foldNum):
            if foldNum < 0 or foldNum >= nFolds:
                raise ValueError(
                    "Fold number must be in range [0, %s), but got %s." % (nFolds, foldNum))
            return True

        checker_udf = UserDefinedFunction(checker, BooleanType())
        for i in range(nFolds):
            training = dataset.filter(checker_udf(dataset[foldCol]) & (col(foldCol) != lit(i)))
            validation = dataset.filter(checker_udf(dataset[foldCol]) & (col(foldCol) == lit(i)))
            if training.rdd.getNumPartitions() == 0 or len(training.take(1)) == 0:
                raise ValueError("The training data at fold %s is empty." % i)
            if validation.rdd.getNumPartitions() == 0 or len(validation.take(1)) == 0:
                raise ValueError("The validation data at fold %s is empty." % i)
            datasets.append((training, validation))

    return datasets
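# A minimal usage sketch, assuming the _kFold above is the one used by
# pyspark.ml.tuning.CrossValidator (Spark >= 3.1, where `foldCol` lets callers supply
# their own fold assignments). `train_df` is a hypothetical DataFrame that already
# contains an integer "fold" column with values in [0, numFolds).
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3,
                    foldCol="fold")  # rows with fold == i form the validation set of fold i
# cvModel = cv.fit(train_df)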
def remove_attachments(df, cleansed_df):
    ATTACHMENT = "attachment"

    def _remove_attachments_key(dic):
        dic_copy = dic.copy()
        for key in dic_copy:
            if ATTACHMENT in key.lower():
                del dic[key]
            elif type(dic_copy[key]) == dict:
                _remove_attachments_key(dic[key])
            elif type(dic_copy[key]) == list:
                for item in dic_copy[key]:
                    if type(item) == dict:
                        _remove_attachments_key(item)
        return dic

    def _field_cleansing(row, field_name):
        row = row and json.loads(row)
        if row:
            field = row.get(field_name, {})
            if type(field) is dict:
                field = _remove_attachments_key(field)
            return field

    for field in df.schema.fields:
        field_type = field.dataType
        field_name = field.name
        if ATTACHMENT in field_name.lower():
            df = df.drop(field_name)
        elif field_type != StringType():
            print('Getting schema type for field {}'.format(field_name))
            structured_field_type = get_schema_type(field_name, cleansed_df)
            if structured_field_type:
                print('Everything was OK for {}'.format(field_name))
                udf_attachments_cleansing = UserDefinedFunction(
                    lambda row: _field_cleansing(row, field_name),
                    structured_field_type)
                df = df.withColumn(field_name, udf_attachments_cleansing("ToJSON"))
    return df
def to_all_users_df(ugr_rdd, ugr_df):
    client = MongoClient()
    coll = client.bgg.game_comments
    username_dict = un_pickle_user_dict()
    count = 1
    schema = StructType([
        StructField('user_id', IntegerType()),
        StructField('game_id', IntegerType()),
        StructField('rating', DoubleType())
    ])
    all_users_recs_df = spark.createDataFrame(sc.emptyRDD(), schema)
    print(all_users_recs_df)
    for username, user_id in username_dict.items():
        if count > 10:
            break
        print("current iteration:", count)
        rated_game_ids_lst = [
            x["game_id"] for x in list(coll.find({"username": username}))
        ]
        # Create an RDD where games are not rated, which will include every entry
        # that is not a game rated by this user_id.
        user_unrated_games_rdd = ugr_rdd.filter(
            lambda x: x[1] not in rated_game_ids_lst).map(
                lambda x: (user_id, int(x[1]), x[2]))
        # Create a data frame from it.
        user_unrated_df = spark.createDataFrame(user_unrated_games_rdd, schema)
        # Replace the rating column via a UDF; the string placeholder does not fit the
        # declared DoubleType, so the values end up null and are filled with 0.0 below.
        name = 'rating'
        udf = UserDefinedFunction(lambda x: 'new_value', DoubleType())
        new_test_df = user_unrated_df.select(*[
            udf(column).alias(name) if column == name else column
            for column in user_unrated_df.columns
        ])
        new_test_df = new_test_df.na.fill(0.0)
        # Drop the duplicates; at this point we have a unique df for the current user in the iteration.
        unique_games_df = new_test_df.dropDuplicates(['game_id'])
        # Now iterate again and use .union on the current data frame, repeat, and return.
        all_users_recs_df = all_users_recs_df.union(unique_games_df)
        print("rows in dataframe:", all_users_recs_df.count())
        count += 1
    return all_users_recs_df
def predict_with_multiple_version(df, versions, model_date, spid):
    columns = df.columns
    for version_name in versions:
        version_infor = MODEL_VERSION_INFO[version_name]
        convmaps = get_convmap_dics(version_name, model_date)
        for k in convmaps[str(spid)].keys():
            df = df.withColumn(k + '_' + version_name,
                               categorical_conv(convmaps[str(spid)][k])(col(k)))
        name_features = version_infor['func_feature_names'](df)
        name_features = convert_name_features(name_features, version_name,
                                              list(convmaps[str(spid)]))
        df = VectorAssembler(inputCols=name_features,
                             outputCol='features_%s' % version_name).transform(df)
    print(df.columns)

    predicted_list = []
    for version_name in versions:
        model = get_model(version_name, spid, model_date)
        prob_col_name = 'prob_%s' % version_name
        df = df.withColumn('features', col('features_%s' % version_name))
        df = model.transform(df).withColumn(
            prob_col_name,
            UserDefinedFunction(lambda x: x.tolist()[1], DoubleType())(col('probability')))
        predicted_list.append(version_name)
        # Keep only the original columns, the probabilities computed so far, and the
        # per-version feature vectors before transforming with the next model.
        df = df.select(columns + ['prob_%s' % v for v in predicted_list] +
                       ['features_%s' % v for v in versions])

    df = df.select(columns + ['prob_%s' % v for v in versions])
    return df
def preprocess_user_data(df):
    drop_list = [
        'compliment_cool', 'compliment_cute', 'compliment_hot', 'compliment_list',
        'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain',
        'compliment_profile', 'compliment_writer', 'compliment_funny', 'friends', 'name'
    ]
    users = df.select([c for c in df.columns if c not in drop_list])

    # Convert all features to long/double.
    indexer = StringIndexer(inputCol='user_id', outputCol='user_id_int')
    userIndexModel = indexer.fit(users)
    users = userIndexModel.transform(users)

    users = users.withColumn("yelping_since_date", unix_timestamp(col("yelping_since")))
    users = users.drop('yelping_since')

    udf = UserDefinedFunction(lambda x: len(x.split(',')) if x != '' else 0, IntegerType())
    users = users.withColumn('num_years_elite', udf(users.elite))
    users = users.drop('elite')

    # Assemble vector + standardize.
    feat_cols = [c for c in users.columns if c not in ['user_id', 'user_id_int']]
    assembler = VectorAssembler(inputCols=feat_cols, outputCol="vector")
    scaler = StandardScaler(inputCol="vector", outputCol="features_unnormed",
                            withStd=True, withMean=True)
    users = assembler.transform(users)
    users = scaler.fit(users).transform(users)
    normalizer = Normalizer(inputCol="features_unnormed", outputCol="features")
    users = normalizer.transform(users)
    users = users.select('user_id', 'user_id_int', 'average_stars', 'features')
    return users
def crear_features():
    from pyspark.sql import functions as f
    base = get_clean_data()

    # Replace empty quoted values with '0' in every column.
    udf = UserDefinedFunction(lambda x: re.sub('""', '0', str(x)), StringType())
    base = base.select(*[udf(column).alias(column) for column in base.columns])

    # Weekend ('findesemana') flag.
    base = base.withColumn(
        'findesemana',
        f.when(f.col('dayofweek') == 5, 1)
         .when(f.col('dayofweek') == 6, 1)
         .when(f.col('dayofweek') == 7, 1)
         .otherwise(0))

    # 'quincena' flag: days around the 15th and the turn of the month.
    base = base.withColumn(
        'quincena',
        f.when(f.col('dayofmonth') == 15, 1)
         .when(f.col('dayofmonth') == 14, 1)
         .when(f.col('dayofmonth') == 16, 1)
         .when(f.col('dayofmonth') == 29, 1)
         .when(f.col('dayofmonth') == 30, 1)
         .when(f.col('dayofmonth') == 31, 1)
         .when(f.col('dayofmonth') == 1, 1)
         .when(f.col('dayofmonth') == 2, 1)
         .when(f.col('dayofmonth') == 3, 1)
         .otherwise(0))

    #base = base.withColumn('dephour', f.when(f.length('crsdeptime') == 3, f.col('crsdeptime').substr(0, 1).cast("float")).otherwise(f.col('crsdeptime').substr(0, 2).cast("float")))
    base = base.withColumn('dephour', f.when(f.col('dayofweek') == 5, 1).otherwise(0))

    # 'seishoras' flag for the six-hour marks (0, 6, 12, 18).
    base = base.withColumn(
        'seishoras',
        f.when(f.col('dephour') == 6, 1)
         .when(f.col('dephour') == 12, 1)
         .when(f.col('dephour') == 18, 1)
         .when(f.col('dephour') == 0, 1)
         .otherwise(0))

    save_rds(base, "semantic.rita")
    return base
def task7a(self):
    """7) count of open / closed (e.g. private) groups a user participates in"""
    groupsProfiles = self.read_parquet_file("groupsProfiles.parquet")
    userGroupsSubs = self.read_parquet_file("userGroupsSubs.parquet")

    invert_id = UserDefinedFunction(lambda x: -int(x))

    user_to_group = userGroupsSubs \
        .select("user", invert_id("group")) \
        .withColumnRenamed("<lambda>(group)", "group") \
        .dropDuplicates()

    group_type = groupsProfiles \
        .select("id", "is_closed") \
        .withColumnRenamed("id", "group") \
        .dropDuplicates()

    user_to_group_type = user_to_group.join(group_type, "group")

    opened_groups = user_to_group_type \
        .filter(user_to_group_type['is_closed'] == 0) \
        .groupBy("user") \
        .count() \
        .withColumnRenamed("count", "opened")

    closed_groups = user_to_group_type \
        .filter(user_to_group_type['is_closed'] > 0) \
        .groupBy("user") \
        .count() \
        .withColumnRenamed("count", "closed")

    result_table = opened_groups \
        .join(closed_groups, "user", how="full_outer") \
        .fillna(0)

    return result_table
def M4():
    # 4) Aggregate (e.g. count, max, mean) characteristics for comments and likes (separately)
    #    made by (a) friends and (b) followers per user.

    def contains(id, groups):  # Filter helper
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    # Get the set of friends for a user
    user_friends = friends \
        .groupBy("profile") \
        .agg(collect_set("follower").alias("friends")) \
        .select("profile", "friends")

    comments = uWallC.select("post_owner_id", "from_id", "post_id")  # Projection

    # Join comments with user -> set-of-friends
    post_comment_to_relation = comments \
        .withColumnRenamed("post_owner_id", "profile") \
        .join(user_friends, "profile", "left_outer") \
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends"))) \
        .select("profile", "is_from_friend", "post_id") \
        .filter(col("is_from_friend") == "true")

    comments_from_friends_per_post = post_comment_to_relation.groupBy("post_id").count()

    aggregates_results = post_comment_to_relation \
        .select("profile", "post_id") \
        .join(comments_from_friends_per_post, "post_id") \
        .groupBy("profile") \
        .agg(max("count"), mean("count"), sum("count")) \
        .sort(desc("sum(count)"))

    return aggregates_results
# mapping for qualification
qualificationMap = {"Graduate": 1.0, "Not Graduate": 0.0}

# mapping for property area
propertyAreaMap = {"Rural": 0.0, "Urban": 1.0}

sc = SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.load('processed_data.csv',
                            format='com.databricks.spark.csv',
                            header=True,
                            inferSchema='true')
data.cache()

# Collapse 'Semiurban' into 'Rural' so property_area has only two levels.
modified_property_area = UserDefinedFunction(
    lambda m: 'Rural' if m == 'Semiurban' else m, StringType())
#print(modified_property_area)
data = data.withColumn('property_area_modified',
                       modified_property_area(data['property_area']))

marital_status_udf = UserDefinedFunction(lambda m: yes_no_map[m], DoubleType())
gender_udf = UserDefinedFunction(lambda g: genderMap.get(g, 0), DoubleType())
self_employed_udf = UserDefinedFunction(lambda se: yes_no_map.get(se, 0), DoubleType())
qualification_udf = UserDefinedFunction(lambda q: qualificationMap.get(q, 0), DoubleType())
self_employed_encode = UserDefinedFunction(lambda at: propertyAreaMap.get(at, 0), DoubleType())

data = data.withColumn('marital_status_encoded', marital_status_udf(data['marital_status'])) \
    .withColumn('gender_encoded', gender_udf(data['gender'])) \
def test_udf_registration_return_type_not_none(self):
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(TypeError, "Invalid returnType"):
            self.spark.catalog.registerFunction(
                "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()),
                StringType())
def test_udf_registration_return_type_none(self):
    two_args = self.spark.catalog.registerFunction(
        "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y, "integer"), None)
    self.assertEqual(two_args.deterministic, True)
    [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
    self.assertEqual(row[0], 5)
def test_udf3(self):
    two_args = self.spark.catalog.registerFunction(
        "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y))
    self.assertEqual(two_args.deterministic, True)
    [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
    self.assertEqual(row[0], u'5')
.master("local")\ .appName("Bank Marketing Prediction")\ .getOrCreate() dataset_all = spark.read.load("paper2/data/bank.csv", format="com.databricks.spark.csv", header="true", inferSchema="true") dataset_all.printSchema() pd.DataFrame(dataset_all.take(5), columns=dataset_all.columns).transpose() dataset_all.describe().toPandas().transpose() #Normalized data start binary_yes_no_map = {'yes': 1.0, 'no': 0.0} toNum = UserDefinedFunction(lambda k: binary_yes_no_map[k], DoubleType()) job_map = {'admin.':0.0, 'blue-collar':1.0, 'entrepreneur':2.0, 'housemaid':3.0, \ 'management':4.0, 'retired':5.0, 'self-employed':6.0, 'services':7.0, \ 'student':8.0, 'technician':9.0, 'unemployed':10.0, 'unknown':11.0} jobToNum = UserDefinedFunction(lambda k: job_map[k], DoubleType()) marital_map = {'divorced': 0.0, 'married': 1.0, 'single': 2.0} maritalToNum = UserDefinedFunction(lambda k: marital_map[k], DoubleType()) education_map = { 'primary': 0.0, 'secondary': 1.0, 'tertiary': 2.0, 'unknown': 3.0 }