def test_collect_functions(self):
    df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    from pyspark.sql import functions

    self.assertEqual(
        sorted(df.select(functions.collect_set(df.key).alias('r')).collect()[0].r),
        [1, 2])
    self.assertEqual(
        sorted(df.select(functions.collect_list(df.key).alias('r')).collect()[0].r),
        [1, 1, 1, 2])
    self.assertEqual(
        sorted(df.select(functions.collect_set(df.value).alias('r')).collect()[0].r),
        ["1", "2"])
    self.assertEqual(
        sorted(df.select(functions.collect_list(df.value).alias('r')).collect()[0].r),
        ["1", "2", "2", "2"])
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]),
                      functions.countDistinct(doubledDf1["name"])).show(truncate=False)

    # sum (use functions.sum, not the Python builtin)
    df2.printSchema()
    df2.select(functions.sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]) \
        .agg(functions.sum(df2["amount"]), functions.grouping(df2["store"])) \
        .show(truncate=False)
    df2.cube(df2["store"], df2["product"]) \
        .agg(functions.sum(df2["amount"]), functions.grouping_id(df2["store"], df2["product"])) \
        .show(truncate=False)
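# A minimal self-contained sketch (not part of the snippet above) illustrating what
# grouping() and grouping_id() return for cube() aggregations; the toy columns
# "store", "product", "amount" are assumed to mirror df2.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("s1", "p1", 10), ("s1", "p2", 20), ("s2", "p1", 30)],
    ["store", "product", "amount"])

# grouping(col) is 1 when that column has been aggregated away by the cube rollup
# (its value shows as NULL), 0 otherwise; grouping_id() packs those bits into an int.
toy.cube("store", "product").agg(
    F.sum("amount").alias("total"),
    F.grouping("store").alias("g_store"),
    F.grouping_id("store", "product").alias("gid"),
).orderBy("gid", "store", "product").show()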
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark DataFrame,
    output a Spark DataFrame in the dense format of top k items for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
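# A hedged sketch (not part of the original module) of how the tie caveat noted
# above could be handled: row_number() breaks equal ratings arbitrarily, so adding a
# secondary sort key makes the top-k selection deterministic. The helper name and
# the column names "userID"/"itemID"/"rating" are illustrative assumptions.
from pyspark.sql import Window, functions as F

def top_k_deterministic(dataframe, k=10,
                        col_user="userID", col_item="itemID", col_rating="rating"):
    window_spec = (
        Window.partitionBy(col_user)
        .orderBy(F.col(col_rating).desc(), F.col(col_item).asc())  # tie-break on item
    )
    return (
        dataframe
        .withColumn("rank", F.row_number().over(window_spec))
        .where(F.col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias("prediction"))
    )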
def _get_relevant_items_by_threshold(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above certain threshold.
    The threshold is defined as a statistical measure of the ratings for a user,
    e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only
            relevant items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(col_user, col_item, col_rating)
        .withColumn(col_prediction,
                    F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
def _get_relevant_items_by_timestamp(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of
            customerID-itemID-rating-timeStamp tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only
            relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction,
                    F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
def loss(self, period, agg_method, loss_method):
    end_date = self.raw_data.select(
        F.date_format(F.max(self.ds), self.ds_format)
    ).rdd.map(lambda x: list(x)).first()[0]
    end_date_be = (parse(end_date) - datetime.timedelta(period)).strftime('%Y-%m-%d')

    calculate_loss = self.raw_data.filter(F.col(self.ds) < end_date_be)
    # go through the underlying RDD; DataFrame itself has no .map in Spark 2+
    loss_result_rdd = calculate_loss.rdd \
        .map(lambda x: ((x[0], x[1]), (x[2], x[3]))) \
        .groupByKey() \
        .map(lambda x: _run(x, period))
    pre_loss_sp = loss_result_rdd.toDF().toDF(self.key, 'ts_type', self.target)

    check_loss_sp = self.raw_data.where(
        ''' {0} >= '{1}' and {0} < '{2}' '''.format('dt', end_date_be, end_date))

    date_list = get_date_range(end_date_be, period)
    date_sp = spark.createDataFrame(map(lambda x: [x], date_list), ['dt'])
    main_sp = pre_loss_sp.select('sku').drop_duplicates().crossJoin(date_sp)
    pre_sp = split_pre_data(pre_loss_sp, date_list, ['sku', 'dt', 'pre'])

    main_sp = main_sp.join(check_loss_sp, on=['sku', 'dt'], how='left') \
        .join(pre_sp, on=['sku', 'dt'], how='left').na.fill(0)
    main_sp = main_sp.join(pre_sp, on=['sku']).join(check_loss_sp, on=['sku', 'dt'], how='left')

    loss_sp = main_sp.groupBy('sku').agg(
        F.udf(agg_cal_loss)(
            F.collect_list(F.struct(F.col('dt'), F.col('sale'), F.col('pre'))),
            F.lit(agg_method),
            F.lit(loss_method)
        ).alias('loss'))
    return loss_sp
def daily_compute(self, busi_date):
    """
    """
    spark = self.spark
    # compute T-1 (previous day on the trading calendar)
    yesterday = get_date(self.date_order, self.order_date, busi_date, -1)
    unclose = spark.sql("select * from %s.%s where busi_date='%s'" %
                        (self.fdata, self.unclose_table, yesterday))
    trd = spark.sql("select * from %s.%s where busi_date='%s'" %
                    (self.odata, self.cash_flow_table, busi_date))
    trd = trd.filter("trd_type='long_related' and prd_no!='0.0'")
    asset = spark.sql("select * from %s.%s where busi_date='%s'" %
                      (self.odata, self.asset_table, busi_date))
    asset = asset.select("trade_id", "secu_acc_id", "prd_no", "qty", "mkt_val")
    asset = asset.filter("prd_no!='0.0'")

    trd = trd.withColumn("now_trd", func.struct("trd_qty", "trd_cash_flow", "timestamp"))
    trd = trd.groupBy("trade_id", "secu_acc_id", "prd_no") \
        .agg(func.collect_list("now_trd").alias("now_trd"))
    trd.persist(StorageLevel.DISK_ONLY)
    trd.count()

    df = full_outer_join(unclose, trd, ["trade_id", "secu_acc_id", "prd_no"])
    df = full_outer_join(df, asset, ["trade_id", "secu_acc_id", "prd_no"])
    df.persist(StorageLevel.DISK_ONLY)
    df.count()

    data = df.rdd.map(lambda x: Row(**long_compute(x.asDict(recursive=True), busi_date)))
    if data.count() > 0:
        data = data.toDF()
        close = data.filter("remain_qty = 0")
        close = close.withColumn("close_date", close.busi_date)
        self.check_data(close)
        unclose = data.filter("remain_qty != 0")
        self.save_close_data(close, busi_date)
        self.save_unclose_data(unclose, busi_date)
    else:
        print("no output from the long-side closed-position computation")
def get_client_similar_cigar(spark):
    # register the UDF and restrict the analysis to the last three months of data
    strlen = spark.udf.register("strlen", lambda x: len(x))
    begin_time = (date.today() - timedelta(days=90)).strftime("%Y%m%d")
    lines = spark.sql(
        f"SELECT co_num, item_id FROM DB2_DB2INST1_CO_CO_LINE WHERE \
        born_date > {begin_time} AND qty_rsn > 0 AND qty_ord > 0"
    )

    # build the training set
    # orders containing fewer than two SKUs are excluded from training
    items_df = lines.groupBy("co_num").agg(f.collect_list("item_id").alias("sentence")).select("sentence")
    items_df = items_df.filter(strlen("sentence") >= 2)

    # Word2Vec parameters may need tuning
    word2Vec = Word2Vec(vectorSize=30, seed=88, minCount=1, inputCol="sentence", outputCol="model")
    model = word2Vec.fit(items_df)

    # build the test set
    raw_data = lines.select(col("item_id").alias("items")).distinct()
    raw_data = raw_data.dropna()
    raw_data = list(map(lambda x: x['items'], raw_data.collect()))
    predict_df = pd.DataFrame(raw_data, columns=["origin"])
    predict_df["near_list"] = predict_df["origin"].apply(lambda x: combineWithSyn(x, model))

    out = []
    for v in predict_df.values:
        df = pd.DataFrame(v[1], columns=["other", "distance"])
        df["origin"] = v[0]
        out.append(df.copy())
    df = pd.concat(out)

    # build the corresponding Spark DataFrame: cigarette, similar cigarette, similarity
    df = spark.createDataFrame(df[["origin", "other", "distance"]], ["origin", "other", "distance"])
    return df
def get_variants_df_with_case_duration(df, parameters=None):
    """Gets the variants dataframe from the Spark dataframe, with case duration included"""
    if parameters is None:
        parameters = {}

    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY

    ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key, activity_key)
    grouped_df = ordered_df.groupby(case_id_glue)

    df1 = grouped_df.agg(F.collect_list(activity_key).alias("variant"))
    df1 = df1.withColumn("variant", F.concat_ws(",", "variant")).orderBy(case_id_glue)

    start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key))
    first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns)

    end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key))
    last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns)
    last_eve_df = last_eve_df.withColumnRenamed(timestamp_key, timestamp_key + "_2")
    last_eve_df = last_eve_df.withColumnRenamed(activity_key, activity_key + "_2")

    stacked_df = first_eve_df.join(last_eve_df, case_id_glue).orderBy(case_id_glue)
    stacked_df = stacked_df.withColumn(
        "caseDuration",
        F.unix_timestamp(stacked_df[timestamp_key + "_2"]) - F.unix_timestamp(stacked_df[timestamp_key]))

    new_df = df1.join(stacked_df, case_id_glue)
    return new_df
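# A minimal self-contained sketch (toy data, assumed column names "case", "activity",
# "time") of what the aggregation above produces: one row per case with a
# comma-joined variant string and the case duration in seconds.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
log = spark.createDataFrame(
    [("c1", "A", "2021-01-01 10:00:00"),
     ("c1", "B", "2021-01-01 10:05:00"),
     ("c2", "A", "2021-01-01 11:00:00")],
    ["case", "activity", "time"],
).withColumn("time", F.to_timestamp("time"))

variants = (
    log.orderBy("time")
    .groupby("case")
    .agg(
        F.concat_ws(",", F.collect_list("activity")).alias("variant"),
        (F.unix_timestamp(F.max("time")) - F.unix_timestamp(F.min("time"))).alias("caseDuration"),
    )
)
# expected: c1 -> "A,B" with 300s, c2 -> "A" with 0s (list order relies on the orderBy above)
variants.show()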
def shuffle_generator_method_2(df):
    df = df.select(['source_id', 'cited_source_uid', 'reference_issn', 'reference_year'])
    df = df.sort("reference_year")

    df_g = df.groupby('reference_year').agg(collect_list(df.cited_source_uid))
    df_g = df_g.withColumnRenamed('collect_list(cited_source_uid)', 'csi')
    # df_g.show()
    df_g = df_g.select(shuffle(df_g.csi).alias('s_csi'))
    # df_g.show()
    df_g = df_g.select(explode(df_g.s_csi).alias('s_csi'))
    # df_g.show()
    # df.show()

    df = df.withColumn('id', monotonically_increasing_id())
    df_g = df_g.withColumn('id', monotonically_increasing_id())
    df = df.join(df_g, df.id == df_g.id, 'inner').select(
        df.source_id, df.cited_source_uid, df.reference_issn, df_g.s_csi, df.reference_year)
    # df.show()
    # print("length of dataframe before filter ", df.count())

    # filter rows for duplication
    # df = df.filter("cited_source_uid != s_csi")
    filter_df = df.filter("cited_source_uid == s_csi").select('source_id').distinct()
    df = df.join(filter_df, ['source_id'], 'leftanti').select(
        df.source_id, df.reference_issn, df.cited_source_uid, df.s_csi, df.reference_year)
    df = df.withColumnRenamed('s_csi', 's_cited_source_uid')
    return df
def get_top_10_coinstalls(addons_expanded_day):
    def str_map_to_dict(m):
        result = {}
        for i in m:
            k, v = i.split("=")
            result[k] = v
        return result

    def format_row(row):
        return Row(
            addon_id=row.addon_id,
            top_10_coinstalls=str_map_to_dict(row.top_10_coinstalls),
        )

    w = Window().partitionBy("addon_id").orderBy(F.col("count").desc())
    d = (
        addons_expanded_day.join(
            addons_expanded_day.filter("is_system=false").withColumnRenamed(
                "addon_id", "coaddon"
            ),
            on="client_id",
        )
        .groupby("client_id", "addon_id", "coaddon")
        .count()
        .withColumn("rn", (F.row_number().over(w) - F.lit(1)))  # start at 0
        .filter("rn BETWEEN 1 and 10")  # ignore 0th addon (where coaddon==addon_id)
        .groupby("addon_id")
        .agg(
            F.collect_list(F.concat(F.col("rn"), F.lit("="), "coaddon")).alias(
                "top_10_coinstalls"
            )
        )
        .rdd.map(format_row)
        .toDF()
    )

    return d
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get the input customer-item-rating tuple in the format of Spark DataFrame,
    output a Spark DataFrame in the dense format of top k items for each user.

    .. note::
        if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        pyspark.sql.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating,
            row_number().over(window_spec).alias("rank"))
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
def group_batched_logs(df_logs):
    # group logs by did + interval_time + keyword.

    # group 1: group by did + interval_starting_time + keyword
    df = df_logs.groupBy('aid', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        first('age').alias('age'),
        first('gender_index').alias('gender_index'),
        first('aid_bucket').alias('aid_bucket'),
        fn.sum(col('is_click')).alias('kw_clicks_count'),
        fn.sum(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('kw_shows_count'),
    )
    # df = df.orderBy('keyword_index')
    df = df.withColumn('kwi_clicks_count',
                       concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
    df = df.withColumn('kwi_shows_count',
                       concat_ws(":", col('keyword_index'), col('kw_shows_count')))
    df = df.withColumn('kw_clicks_count',
                       concat_ws(":", col('keyword'), col('kw_clicks_count')))
    df = df.withColumn('kw_shows_count',
                       concat_ws(":", col('keyword'), col('kw_shows_count')))

    # group 2: group by did + interval_starting_time
    df = df.groupBy('aid', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('kwi'),
        concat_ws(",", collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
        concat_ws(",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
        concat_ws(",", collect_list('kw_shows_count')).alias('kw_show_counts'),
        first('age').alias('age'),
        first('gender_index').alias('gender_index'),
        first('aid_bucket').alias('aid_bucket'))
    return df
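# A minimal sketch (toy data, invented column names) of the concat_ws/collect_list
# pattern used above: per-keyword "key:count" strings are first built with
# concat_ws(":", ...), then rolled up into one comma-separated string per group.
from pyspark.sql import SparkSession, functions as fn

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("u1", "sports", 3), ("u1", "news", 1), ("u2", "sports", 2)],
    ["uid", "keyword", "clicks"])

rolled = (
    toy.withColumn("kw_clicks", fn.concat_ws(":", fn.col("keyword"), fn.col("clicks")))
    .groupBy("uid")
    .agg(fn.concat_ws(",", fn.collect_list("kw_clicks")).alias("kw_click_counts"))
)
rolled.show(truncate=False)  # e.g. u1 -> "sports:3,news:1" (list order is not guaranteed)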
def joinDF(rev_df, prod_df, cat):
    postgres_url = os.getenv('POSTGRES_URL', 'default')
    postgres_user = os.getenv('POSTGRES_USER', 'default')
    postgres_pw = os.getenv('POSTGRES_PW', 'default')
    postgres_properties = {"user": postgres_user, "password": postgres_pw}
    table_name = cat + "joined"

    prod_df = prod_df.filter(functions.array_contains("categories", cat))
    joined_df = rev_df.join(prod_df, rev_df.asin == prod_df.productid)
    joined_df = joined_df.groupby("reviewerid").agg(
        functions.collect_list("categories").alias("categories"))
    flatAll_udf = functions.udf(flatAll, ArrayType(StringType()))
    joined_df = joined_df.withColumn("categories", flatAll_udf(joined_df.categories))
    # tuple-parameter unpacking in lambdas is Python 2 only; index the row instead
    joined_df = joined_df.rdd.flatMap(
        lambda row: [(row[0], c) for c in row[1]]).toDF(["reviewerid", "category"])
    joined_df = joined_df.groupby("reviewerid").pivot("category").count()
    joined_df = joined_df.na.fill(0)
    joined_df.write.jdbc(url=postgres_url, table=table_name, mode='overwrite',
                         properties=postgres_properties)
    return
def create_buckets(sc, dataframe, buckets=20, prediction_col='prediction'):
    n_buckets = sc.broadcast(buckets)

    generate_list_udf = F.udf(
        f=lambda l, minimum, maximum, boundary: ShowResults.create_linspace(
            data=l, min=minimum, max=maximum, boundary=boundary,
            buckets=n_buckets.value
        ),
        returnType=T.ArrayType(
            elementType=T.ArrayType(elementType=T.IntegerType(), containsNull=True),
            containsNull=True
        )
    )

    tmp = (dataframe
           .groupBy(prediction_col, F.col('computed_boundary'))
           .agg(F.min('distance').alias('min'),
                F.max('distance').alias('max'),
                F.sum('is_outlier').alias('n_outliers'),
                F.collect_list('distance').alias('distances'))
           .withColumn(colName='buckets',
                       col=generate_list_udf('distances', 'min', 'max', 'computed_boundary'))
           )

    return tmp.select(prediction_col, 'buckets')
def main(spark, test_file, index_file, model_file):
    # load test data and create dataframe
    test_df = spark.read.parquet(test_file)
    model_indexer = PipelineModel.load(index_file)
    # transform user and track ids for test data
    test_df = model_indexer.transform(test_df)

    # store ground truth for each user
    user_truth = test_df.groupby('user_label').agg(
        F.collect_list('book_label').alias('truth'))
    print('created ground truth df')

    als_model = ALSModel.load(model_file)
    # predict based on the top 500 items for each user
    recommend = als_model.recommendForAllUsers(500)
    print('recommendation has been created.')

    # RMSE
    predict = als_model.transform(test_df)
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predict)
    print('Root mean square error is ' + str(rmse))

    # prediction = spark.sql('SELECT * FROM recommend INNER JOIN user_truth WHERE recommend.user_label=user_truth.user_label')
    # a join turned out to be faster than the sparksql version above
    prediction = recommend.join(user_truth,
                                recommend.user_label == user_truth.user_label, 'inner')
    score = prediction.select('recommendations.book_label', 'truth').rdd.map(tuple)

    rank_metric = RankingMetrics(score)
    precision = rank_metric.precisionAt(500)
    mean_precision = rank_metric.meanAveragePrecision
    print(' precision at 500 ' + str(precision) + ' mean average precision of ' + str(mean_precision))
def task_a_1_step_1_final(spark):
    a1_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_item", T.MapType(T.StringType(), T.IntegerType())),
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-country_step-0") \
        .parse_json(a1_struct) \
        .withWatermark("datetime_end", "1 minute") \
        .groupBy(
            F.window("datetime_end", "6 hour", "1 hour")
        ).agg(
            F.first("window.start").alias("timestamp_start"),
            F.first("window.end").alias("timestamp_end"),
            F.collect_list("map_item").alias("statistics")
        ).select(
            F.struct(
                F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
                F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
                col('timestamp_end').alias("time_end"),
                sum_maps_udf(col('statistics')).alias("statistics")
            ).alias("res")
        ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-country", config.LOG_PREFIX)

    return result
def remove_duplicated_paths(paths, trips):
    """
    Given a paths dataframe, it will remove the duplicated paths in `paths`
    and update the `trips` with the non-duplicated paths
    """
    old_paths_count = paths.count()
    old_trips_count = trips.count()

    # Group the paths that are the same
    similar_paths = paths.groupBy("path_latitudes", "path_longitudes").agg(
        F.min("path_id").alias("new_path_id"),
        F.collect_list("path_id").alias("old_path_ids"),
    )

    # Make a map of the old path IDs to the new path IDs
    # NOTE: the exploded column name is called 'col'
    path_id_mappings = similar_paths.select("new_path_id", F.explode("old_path_ids"))

    # Update the trips' reference to path ids
    trips = trips.join(
        path_id_mappings, trips.path_id == path_id_mappings.col, how="leftouter"
    )
    trips = trips.withColumn("path_id", trips.new_path_id)

    # Grab the unique paths
    unique_paths = (
        similar_paths.select("new_path_id", "path_latitudes", "path_longitudes")
        .withColumnRenamed("new_path_id", "path_id")
        .drop("new_path_id")
    )

    new_paths_count = unique_paths.count()
    new_trips_count = trips.count()
    assert new_paths_count <= old_paths_count and new_trips_count == old_trips_count

    return unique_paths, trips
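# A minimal sketch (toy frames, invented IDs) of the dedup idea above: identical
# lat/lon sequences are grouped, the smallest path_id is kept, and an exploded
# (new_path_id, old_path_id) mapping is used to repoint the trips.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
paths = spark.createDataFrame(
    [(1, "a", "b"), (2, "a", "b"), (3, "c", "d")],
    ["path_id", "path_latitudes", "path_longitudes"])
trips = spark.createDataFrame([(10, 1), (11, 2), (12, 3)], ["trip_id", "path_id"])

similar = paths.groupBy("path_latitudes", "path_longitudes").agg(
    F.min("path_id").alias("new_path_id"),
    F.collect_list("path_id").alias("old_path_ids"))
mapping = similar.select("new_path_id", F.explode("old_path_ids").alias("old_path_id"))
trips = (trips.join(mapping, trips.path_id == mapping.old_path_id, "leftouter")
         .withColumn("path_id", F.col("new_path_id"))
         .select("trip_id", "path_id"))
trips.show()  # trips 10 and 11 both point at path 1 now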
def get_top_k_items(
    dataframe, col_user="******", col_item="itemID", col_rating="rating", k=10
):
    """Get the input customer-item-rating tuple in the format of Spark DataFrame,
    output a Spark DataFrame in the dense format of top k items for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn("prediction", F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, "prediction")
        .dropDuplicates([col_user, "prediction"])
    )

    return items_for_user
def top_5_selling_each_day_pyspark(self):
    """
    returns top 5 selling products for each day using pyspark syntax
    :return:
    """
    window = Window.partitionBy('date').orderBy(F.col('total_amount').desc())
    tmp = \
        self.df \
        .groupBy([F.to_date(F.col('timestamp')).alias('date'), 'ID']) \
        .agg(F.sum('quantity').alias('total_amount')) \
        .orderBy('date', ascending=False) \
        .withColumn('row_number', F.row_number().over(window)) \
        .filter(F.col('row_number') <= 5) \
        .withColumn('zipped', F.concat("ID", F.lit(':'), "total_amount")) \
        .groupBy('date').agg(F.collect_list('zipped').alias("product_id:number_sold")) \
        .orderBy('date') \
        .show(50, truncate=False)
    return tmp
def get_n_freq_str(df_counts, top_n: int) -> DataFrame:
    """
    Extract the n most frequent values from get_counts and collect them into a
    string representation, so that each column has one row mapped to a string
    with the top n frequent values and their counts.
    """
    conc = udf(lambda lst: "{}->{}".format(str(lst[0]), str(lst[1])), StringType())
    mapped_counts = df_counts.select(
        'col_name',
        conc(struct('frequent_value', 'count')).alias('frequent_value->count'))
    # mapped_counts = df_counts.select('col_name', create_map(['frequent_value', 'count']).alias('frequent_value->count'))  # to collect to map

    df_output = mapped_counts.groupBy('col_name').agg(
        collect_list(spark_col('frequent_value->count')).alias('frequent_values->counts'))

    freq_col_name = 'frequent_values->counts_(top_{:d})'.format(top_n)
    conv = udf(lambda lst: [str(dct) for dct in lst[:top_n]], ArrayType(StringType()))  # take top n
    df_output = df_output.select(
        'col_name', conv('frequent_values->counts').alias(freq_col_name))
    return df_output
def create_normalization_spec_spark(df, column, num_samples: int, seed: Optional[int] = None):
    """Returns approximately num_samples random rows from column of df."""
    # assumes column has a type of map
    df = df.select(
        explode(col(column).alias("features")).alias("feature_name", "feature_value"))

    # calculate fractions
    counts_df = df.groupBy("feature_name").count()
    frac = {}
    for row in counts_df.collect():
        assert num_samples <= row["count"]
        frac[row["feature_name"]] = num_samples / row["count"]

    # TODO(T64843081): change to reservoir sampling, currently it approximates
    # perform sampling and collect them
    df = df.sampleBy("feature_name", fractions=frac, seed=seed)
    df = df.groupBy("feature_name").agg(
        collect_list("feature_value").alias("feature_values"))
    return df
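# A minimal sketch (toy data) of the stratified sampling step above: per-key
# fractions are derived from the group counts so that each feature_name
# contributes roughly num_samples rows to the sample.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("f1", float(i)) for i in range(100)] + [("f2", float(i)) for i in range(50)],
    ["feature_name", "feature_value"])

num_samples = 10
frac = {r["feature_name"]: num_samples / r["count"]
        for r in toy.groupBy("feature_name").count().collect()}
sampled = toy.sampleBy("feature_name", fractions=frac, seed=7)
sampled.groupBy("feature_name").count().show()  # roughly 10 rows per key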
def user_personalization(df_similarity, df_active):
    df_pop = user_item_popularity(df_active)
    df_tmp = user_item_similarities(df_pop, df_similarity)
    df_save = df_tmp.select("user_id", "_id", "score") \
        .withColumn("Recommendations", F.struct(F.col("_id"), F.col("score"))) \
        .select("user_id", "Recommendations") \
        .groupby("user_id").agg(F.collect_list("Recommendations").alias("Recommendations"))

    start_time = time.time()
    df_save.repartition(1) \
        .write.format("json") \
        .mode("overwrite") \
        .option("header", "true") \
        .save("userrr_2")
    print("Saving user personalization json took - ", (time.time() - start_time))
def top20_channels(data):
    result = {"channels": []}
    channels = data.groupBy(["channel_title"]) \
        .agg(collect_list(array('video_id', 'trending_date', "views")).alias("trending_days"),
             countDistinct("video_id"),
             sum("views").alias("all_views")) \
        .orderBy("all_views", ascending=False).head(20)

    i = 0
    for row in channels:
        result["channels"].append({
            "channel_name": row["channel_title"],
            "start_date": row["trending_days"][0][1],
            "end_date": row["trending_days"][-1][1],
            "total_view": row["all_views"],
            "videos_views": []
        })
        for video in row["trending_days"]:
            result["channels"][i]["videos_views"].append({
                "video_id": video[0],
                "views": video[2]
            })
        i += 1
    return result
def process_df(self, df):
    def detect_anomaly(ts):
        """
        Args
            ts: pandas.series
        rtype: int
        """
        outliers_indices = seasonal_esd(ts, hybrid=True, max_anomalies=10)
        return len(outliers_indices)

    grouped_df = df.groupBy(["id"]).agg(
        F.collect_list("downsample_avg").alias("downsampled_ts"),
        first("start_ts").alias("start_ts"),
        last("end_ts").alias("end_ts"))
    anomaly_udf = udf(detect_anomaly, IntegerType())
    processed_df = grouped_df.withColumn(
        "num_anomaly", anomaly_udf("downsampled_ts")).sort(desc("num_anomaly"))
    final_df = processed_df.select("id", "start_ts", "end_ts", "num_anomaly")

    try:
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb", "datanode", "password")
        connector.write(final_df, "global_anomalies_table", "append")
    except Exception as e:
        print(e)
        pass
def run(spark, config, **kwargs):
    df = read_tidb(spark, config, "select id, title, mp_id from content.content")
    df = df.withColumn('words', jieba_seg_udf(df['title']))
    # df.show(10)
    model = Word2Vec(numPartitions=10, inputCol='words', outputCol='vecs', seed=42).fit(df)
    df_transformed = model.transform(df)

    df_cross = df_transformed.select(
        col('id').alias('id1'), col('vecs').alias('vecs1')
    ).crossJoin(df_transformed.select(
        col('id').alias('id2'), col('vecs').alias('vecs2'))
    )
    df_cross = df_cross.withColumn('sim', sim(df_cross['vecs1'], df_cross['vecs2']))
    df_simi = df_cross.filter(col('sim') < 1)
    df_simi_top10 = get_topN(df_simi, col('id1'), col('sim'), n=10)
    df_simi_top10.show()
    df_simi_top10.groupby("id1").agg(collect_list("id2").alias("sim_ids")).show(10)
    return
def main(spark, val_file, model_file):
    model = ALSModel.load(model_file)
    print('finish loading models')

    val_df = spark.read.parquet(val_file)
    val_df = val_df.select('user_label', 'track_label')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))
    print('Finish preparing test data')
    val_grouped.cache()

    predictions = model.recommendForAllUsers(500)
    print('finish making predictions')
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr("_1 as user_label", "_2 as recommendations")

    # Join table
    val_pred = val_grouped.join(prediction_df, "user_label", "inner")
    print('finish joining data')

    # Instantiate ranking metrics to compare predicted and actual items
    rdd = val_pred.select('recommendations', 'track_label').rdd
    print('final steps')
    ranking_metrics = RankingMetrics(rdd)

    # MAP
    print("MAP = %s" % ranking_metrics.meanAveragePrecision)
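# A minimal sketch (toy lists, local SparkSession assumed) of the input that
# RankingMetrics expects: an RDD of (predicted_items, ground_truth_items) pairs,
# which is what the join above produces per user.
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.getOrCreate()
pairs = spark.sparkContext.parallelize([
    ([1, 2, 3, 4], [2, 4]),   # predicted ranking, relevant items for user A
    ([5, 6, 7], [8]),         # user B: no hits
])
metrics = RankingMetrics(pairs)
print(metrics.meanAveragePrecision)
print(metrics.precisionAt(3))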
def _get_relevant_items_by_threshold(dataframe,
                                     col_user=DEFAULT_USER_COL,
                                     col_item=DEFAULT_ITEM_COL,
                                     col_rating=DEFAULT_RATING_COL,
                                     col_prediction=DEFAULT_PREDICTION_COL,
                                     threshold=DEFAULT_THRESHOLD):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above certain threshold.
    The threshold is defined as a statistical measure of the ratings for a user,
    e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only
            relevant items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(col_user, col_item, col_rating)
        .withColumn(col_prediction,
                    F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates())

    return items_for_user
def group_batched_logs(df_logs_batched):
    # group the logs to generate the train-ready data from the basic unit of
    # uckey + interval_time + keyword.

    # group 1: group by uckey + interval_starting_time + keyword
    df = df_logs_batched.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        fn.sum(col('is_click')).alias('keyword_click_count'),
        fn.count(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('keyword_show_count')
    )
    df = df.withColumn('keyword_index_click_count',
                       concat_ws(":", col('keyword_index'), col('keyword_click_count')))
    df = df.withColumn('keyword_index_show_count',
                       concat_ws(":", col('keyword_index'), col('keyword_show_count')))
    df = df.withColumn('keyword_click_count',
                       concat_ws(":", col('keyword'), col('keyword_click_count')))
    df = df.withColumn('keyword_show_count',
                       concat_ws(":", col('keyword'), col('keyword_show_count')))

    # group 2: group by uckey + interval_starting_time
    df = df.groupBy('uckey', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('interval_keyword_indexes'),
        concat_ws(",", collect_list('keyword_index_click_count')).alias('interval_keyword_indexes_click_counts'),
        concat_ws(",", collect_list('keyword_index_show_count')).alias('interval_keyword_indexes_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('keyword_click_count')).alias('interval_keywords_click_counts'),
        concat_ws(",", collect_list('keyword_show_count')).alias('interval_keywords_show_counts')
    )
    return df
def run(self):
    """
    Run application
    :return:
    """
    join_df = self.join_df
    self.persist()

    # Create dataframes for the intersection, cogo labs only, and liveworks only users
    logger.info("Creating common dataframe where emd5 is present in both live works and cogo labs")
    """
    Cogo labs
        emd5  Name
        1     Sam
        2     Henry

    Liveworks
        emd5  Name
        2     John
        3     Smith

    Full Outer Join
        c_emd5  l_emd5  c_name  l_name
        1       Null    Sam     Null
        2       2       Henry   John
        Null    3       Null    Smith

    Intersection of cogo labs and Live works, where c_emd5 and l_emd5 are not null
        c_emd5  l_emd5  c_name  l_name
        2       2       Henry   John

    Users only from cogo labs, where c_emd5 is not null and l_emd5 is null
        c_emd5  l_emd5  c_name  l_name
        1       Null    Sam     Null

    Users only from live works, where l_emd5 is not null and c_emd5 is null
        c_emd5  l_emd5  c_name  l_name
        Null    3       Null    Smith
    """
    common_df = join_df.filter(~join_df.emd5.isNull() & ~join_df.cogo_emd5.isNull())

    logger.info("Creating cogo labs only dataframe where emd5 is present in cogo labs and not in live works")
    cogo_labs_only_df = join_df.filter(~join_df.cogo_emd5.isNull() & join_df.emd5.isNull())

    logger.info("Creating live works only dataframe where emd5 is present in liveworks and not in cogo labs")
    live_works_only_df = join_df.filter(join_df.cogo_emd5.isNull() & ~join_df.emd5.isNull())

    # counting distinct emd5 counts
    intersection_count = common_df.select(common_df.cogo_emd5).distinct().count()
    cogo_labs_only_count = cogo_labs_only_df.select(cogo_labs_only_df.cogo_emd5).distinct().count()
    live_works_only_count = live_works_only_df.select(live_works_only_df.emd5).distinct().count()

    logger.info("Number of unique users present in both cogo labs and liveworks %s", intersection_count)
    logger.info("Number of unique users present only in cogo labs data %s", cogo_labs_only_count)
    logger.info("Number of unique users present only in live works data %s", live_works_only_count)

    # Create common job data frame with users having the same job title
    logger.info("Creating common job data frame where common users have the same job title")
    common_job_df = common_df.where(common_df.cogo_job == common_df.job)
    common_job_df.persist(StorageLevel.DISK_ONLY)
    print("Output with common emd5 users having same job title")
    common_job_df.show()
    common_job_count = common_job_df.count()

    # Calculate the percentage of common emd5 users that have different job titles
    different_jobs_percent = ((intersection_count - common_job_count) / intersection_count) * 100
    logger.info("Number of users with common job present in both cogo labs and liveworks %s",
                common_job_df.count())
    logger.info("Percent with different job titles in the intersection %s", different_jobs_percent)

    # jsonify data from the common data frame
    """
    Create Key:Value pairs where Key = Job title, Value = Company Name

    cogolabs_emd5  cogolabs_job         cogolabs_company  liveworks_job  liveworks_company
    1              Hotel manager        Bender PLC        Barrister      Brown PLC
    1              Immigration officer  Diaz Ltd

    cogolabs_emd5  cogo_labs_c                            liveworks_c
    1              {"Hotel manager":"Bender PLC"}         {"Barrister":"Brown PLC"}
    1              {"Immigration officer":"Diaz Ltd"}
    """
    common_json_df = common_df \
        .withColumn("live_works_c",
                    concat(lit("{\""), common_df.job, lit("\":\""), common_df.company, lit("\"}"))) \
        .withColumn("cogo_labs_c",
                    concat(lit("{\""), common_df.cogo_job, lit("\":\""), common_df.cogo_company, lit("\"}")))

    """
    Concatenate the results into an array of key value pairs grouped by emd5

    emd5  cogolabs_json                                                        liveworks_json
    1     [{"Hotel manager":"Bender PLC"},{"Immigration officer":"Diaz Ltd"}]  [{"Barrister":"Brown PLC"}]
    """
    common_agg_df = common_json_df.select(common_json_df.emd5,
                                          common_json_df.cogo_labs_c,
                                          common_json_df.live_works_c) \
        .groupBy(common_json_df.emd5).agg(concat_ws(",", collect_list(common_json_df.cogo_labs_c)),
                                          concat_ws(",", collect_list(common_json_df.live_works_c))) \
        .withColumnRenamed("concat_ws(,, collect_list(cogo_labs_c))", "cogo_labs_json") \
        .withColumnRenamed("concat_ws(,, collect_list(live_works_c))", "live_works_json")

    final_df = common_agg_df \
        .withColumn("cogo_labs_json", concat(lit("["), common_agg_df.cogo_labs_json, lit("]"))) \
        .withColumn("live_works_json", concat(lit("["), common_agg_df.live_works_json, lit("]")))

    print("Final output with emd5, cogolabs json and liveworks json")
    final_df.persist(StorageLevel.DISK_ONLY)
    final_df.show()

    # save final output as csv
    if os.path.exists("data/final_output/"):
        remove_directory("data/final_output/")
    logger.info("Save final output as csv")
    final_df.repartition(1).write.format("csv").save(os.path.join(DATA_PATH, "final_output"))

    self.unpersist()
HashTagsTable = extracted_SQL_table.select("created_at", explode("hashtags"))
HashTagsTable_WithDates = HashTagsTable.withColumn(
    'Keyword', get_hash_tag('col')).withColumn('Time', tweet_time('created_at'))

# clean up table
columns_to_drop = ['created_at', 'col']
HashTagsTable_WithDates = HashTagsTable_WithDates.drop(*columns_to_drop)
# HashTagsTable_WithDates.show()

to_mongo = HashTagsTable_WithDates.groupBy('Keyword', 'Time').count()
groupby_result = (to_mongo.groupBy("Keyword").agg(
    collect_list(struct("Time", "count")).alias('occurances')))

# loading to REDIS
redis_host = config_dict['redis_path']
redis_port = config_dict['redis_port']  # 'redis_port' key assumed; original indexed config_dict with an undefined name
redis_password = ""
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)

for row in groupby_result.rdd.collect():
    r.set(row.Keyword, json.dumps(row.occurances))
    # set TTL: expire in 60 seconds * 60 min * 24 hours = 86400 seconds
    r.expire(row.Keyword, 86400)

spark.stop()
argparser.add_argument('outdir', help='Output directory')
args = argparser.parse_args()

spark = SparkSession.builder.appName('Cluster Features').getOrCreate()

df = spark.read.load(args.indir)
raw = df.filter(col('size') >= args.clusterSize) \
    .select('cluster', 'size', regexp_replace('text', u'\xad\s*', '').alias('text'))
raw.cache()

tok = RegexTokenizer(inputCol='text', outputCol='terms', gaps=False, pattern='\w+') \
    .transform(raw)
counts = CountVectorizer(inputCol='terms', outputCol='counts', minDF=2.0) \
    .fit(tok).transform(tok)

mergeCounts = udf(
    lambda va, size: threshold_sparse(
        scale_sparse(reduce(add_sparse, va), 1.0 / size), args.minCount),
    VectorUDT())

res = counts.groupBy('cluster', 'size') \
    .agg(mergeCounts(collect_list('counts'), 'size').alias('counts'))

# lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
# model = lda.fit(res)
# model.describeTopics().write.json(args.outdir)
res.write.json(args.outdir)

spark.stop()
def prepareDatasets(sc, spark):
    buisHeader = ['business_id', 'name', 'neighborhood', 'address', 'city', 'state',
                  'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
                  'is_open', 'categories']
    buis = sc.textFile(datapath + 'yelp_business.csv', use_unicode=False)
    buis = buis.filter(lambda row: not row.startswith('business_id,name'))\
        .map(lambda row: re.findall(r'(?:[^,"]|"(?:\\.|[^"])*")+', row.replace(',,', ', ,')))\
        .map(lambda row: map(lambda x: x.replace('"', ''), row))\
        .map(lambda row: dict(zip(buisHeader, row)))\
        .filter(lambda row: row['business_id'] and row['longitude'] and row['latitude'])\
        .filter(lambda row: row['business_id'].strip() and row['longitude'].strip() and row['latitude'].strip())\
        .toDF()
    buis = buis.select('business_id', 'name', 'city', 'state', 'postal_code', 'categories',
                       buis['latitude'].cast('float'), buis['longitude'].cast('float'),
                       buis['stars'].cast('float'), buis['review_count'].cast('int'),
                       buis['is_open'].cast('int'))\
        .dropna(how='any', subset=['business_id', 'longitude', 'latitude'])

    def reviews_mapper(index, lines):
        import csv
        reader = csv.reader(lines)
        if index == 0:
            lines.next()
        for row in reader:
            if len(row) == 9 and len(row[1]) == 22:
                yield row

    reviewsHeader = ["review_id", "user_id", "business_id", "stars", "date", "text",
                     "useful", "funny", "cool"]
    reviews = sc.textFile(datapath + 'yelp_review.csv', use_unicode=False)\
        .mapPartitionsWithIndex(reviews_mapper)\
        .map(lambda x: dict(zip(reviewsHeader, x)))\
        .toDF()
    reviews = reviews.select(
        "review_id", "user_id", "business_id", "text",
        reviews["stars"].cast('float'),
        reviews["date"].cast('date'),
        reviews["useful"].cast('int'),
        reviews["funny"].cast('int'),
        reviews["cool"].cast('int'))\
        .filter(reviews.text.isNotNull())\
        .filter(reviews.business_id.isNotNull())

    reviews = reviews.alias('a').join(buis.alias('b'), sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('b.*', 'a.text')  # ,'a.user_id')
    reviews = reviews.where(
        'longitude > {:f} and longitude < {:f} and latitude > {:f} and latitude < {:f}'
        .format(westAMER, eastAMER, southAMER, northAMER)
    ).cache()

    id_text = reviews.select('business_id', 'text')\
        .groupBy('business_id').agg(sf.concat_ws(' ', sf.collect_list("text")).alias('text_concat'))
    reviews = reviews.drop(reviews.text)\
        .select('business_id', 'categories', 'state', 'stars')\
        .alias('a').join(id_text.alias('b'), sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('a.*', 'b.text_concat')\
        .distinct()\
        .withColumnRenamed('text_concat', 'text')

    # some data cleansing:
    reviews = reviews.withColumn('text', sf.regexp_replace(reviews.text, '\\/', '/'))

    def cleanse(text):
        re_punc = re.compile('[' + re.escape(punctuation) + '0-9\\n\\t\\r]')
        re_spc = re.compile('[ ]+')  # get rid of extra spaces
        return re_spc.sub(' ', re_punc.sub(" ", text))

    cleanser = sf.udf(lambda x: cleanse(x))
    reviews = reviews.withColumn('text', cleanser('text'))

    # tokenizing and removing stop words:
    import pyspark.ml.feature as sparkml
    from pyspark.ml import Pipeline
    tokenizer = sparkml.Tokenizer(inputCol="text", outputCol="words")
    swremover = sparkml.StopWordsRemover(inputCol='words', outputCol='words_clean')
    pipeline = Pipeline(stages=[tokenizer, swremover])
    reviews = pipeline.fit(reviews).transform(reviews)
    reviews = reviews.drop('text', 'words')
    return reviews.cache()
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Import Data
data = sc.textFile("hdfs://sandbox.hortonworks.com:/tmp/household_power_consumption_uci_edu.txt")
header = data.first().split(';')
data = data.map(lambda x: x.split(';')).filter(lambda x: x != header)
df = data.toDF(header)

# Use schema
df1 = df.withColumn('Sub_metering_1', df.Sub_metering_1.cast('float')) \
    .withColumn('Sub_metering_2', df.Sub_metering_2.cast('float')) \
    .withColumn('Sub_metering_3', df.Sub_metering_3.cast('float')) \
    .withColumn('Voltage', df.Voltage.cast('float'))
df1.show(5)

df2 = df1.groupby("Date").agg(F.collect_list("Sub_metering_1").alias('meter1'),
                              F.collect_list("Sub_metering_2").alias('meter2'),
                              F.collect_list("Sub_metering_3").alias('meter3'))
df2.show(5)

def get_udf_distance(array1, array2):
    distance, path = fastdtw(array1, array2, dist=euclidean)
    return distance

udf_dtw = udf(get_udf_distance, FloatType())

df3 = df2.select('Date', udf_dtw(df2.meter1, df2.meter2).alias('dtw_distance (meter1-meter2)'))
df3.show()
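# A minimal local sketch (no Spark) of what udf_dtw computes per row: the fastdtw
# distance between two per-day lists of readings. It assumes the fastdtw and scipy
# packages used by the snippet; each reading is wrapped as a 1-element vector so
# scipy's euclidean() receives 1-D input.
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

meter1 = [[0.0], [1.0], [2.0], [3.0]]
meter2 = [[0.0], [1.5], [2.5], [3.0]]
distance, path = fastdtw(meter1, meter2, dist=euclidean)
print(distance)   # small value: the two series are closely aligned
print(path[:3])   # the warping path as (index1, index2) pairs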
attributes_path = '../input/attributes.csv'

spark = SparkSession.builder.getOrCreate()
df = spark.read.csv(data_path, header=True)
# descr_df = spark.read.csv(description_path, header=True)
attr_df = spark.read.csv(attributes_path, header=True)

# df = df.join(descr_df, ['product_uid'])
# attr_df = attr_df.groupBy('product_uid').agg(f.concat_ws(" ", f.collect_list(attr_df.name))).show()
attr_df = attr_df.select('product_uid', 'value').groupBy('product_uid').agg(
    concat_ws(' ', collect_list('value')).alias('attribute_names'))
df = df.join(attr_df, ['product_uid'])
df.show()

df = df.withColumn('new_relevance', df['relevance'].cast(FloatType()))
replaceDigitsUdf = udf(replaceDigits, StringType())
createClassLabelUdf = udf(createClassLabel, IntegerType())
df = df.withColumn('label', createClassLabelUdf('new_relevance'))
df = df.withColumn('nodigits_title', replaceDigitsUdf('product_title'))
df = df.withColumn('nodigits_sterm', replaceDigitsUdf('search_term'))
def run_spark_job(sc, run_N_times=1):
    ###########################################################################
    # utility functions:
    ###########################################################################
    def print_df(df, only_display=100):
        outout = df.collect()
        for idx, o in enumerate(outout):
            if idx > only_display:
                print(' And more ...')
                break
            print(o)

    # print spark configurations
    def printSparkConfigurations():
        c = SparkConf()
        print("Spark configurations: {}".format(c.getAll()))

    # Use udf to define a row-at-a-time udf
    def myFunc(data_list):
        s = set()
        for d in data_list:
            s.add(d)
        count = len(s)
        output_str = reduce(lambda x, y: x + ', ' + y, s)
        return "{} {}".format(count, output_str)

    def read_csv_in_chunk(input_file):
        def optimize_chunk(chunk, column_names):
            for n in column_names:
                chunk[n] = chunk[n].astype('category')
            return chunk

        df = pd.DataFrame()
        chunk_size = 50000
        # assuming the file contains a header
        for idx, chunk in enumerate(pd.read_csv(input_file, sep=",", delimiter=",", chunksize=chunk_size)):
            print("Reading the csv input in chunk index: {} on chunk_size: {} ".format(idx, chunk_size))
            # optimize the rows by changing them to category type
            chunk = optimize_chunk(chunk, ['trnk_wire', 'trnk_light', 'trnk_other', 'state',
                                           'root_grate', 'root_other', 'root_stone', 'sidewalk',
                                           'health', 'curb_loc', 'brnch_ligh', 'brnch_othe',
                                           'brnch_shoe', 'guards', 'user_type', 'status',
                                           'steward', 'boroname', 'zip_city'])
            df = pd.concat([df, chunk], ignore_index=True)
        return df

    printSparkConfigurations()

    input_file = './2015StreetTreesCensus_TREES.csv'
    # input_file = './2015StreetTreesCensus_TREES_small.csv'
    pandas_df_raw = read_csv_in_chunk(input_file)

    for i in range(run_N_times):
        # there are some empty columns (internally interpreted as nan inside the dataframe)
        # in the file that caused exceptions, so map them to an empty string
        pandas_df = pandas_df_raw.replace(np.nan, '', regex=True)
        pandas_df = pandas_df[['created_at', 'block_id', 'spc_common']]
        pandas_df = sc.createDataFrame(pandas_df)
        # repartition returns a new DataFrame, so keep the result (optimization trick!)
        pandas_df = pandas_df.repartition(5000, 'created_at')
        myUdf = udf(myFunc, StringType())

        created_at_block_id_summaries = pandas_df.groupby(['created_at', 'block_id']) \
            .agg(collect_list('spc_common').alias('spc_common')) \
            .withColumn('spc_common', myUdf('spc_common'))
        print_df(created_at_block_id_summaries)

        block_id_created_at_summaries = pandas_df.groupby(['block_id', 'created_at']) \
            .agg(collect_list('spc_common').alias('spc_common')) \
            .withColumn('spc_common', myUdf('spc_common'))
        print_df(block_id_created_at_summaries)
("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100) ] schema = ["employee_name", "department", "salary"] df = spark.createDataFrame(data=simpleData, schema = schema) df.printSchema() df.show(truncate=False) print("approx_count_distinct: " + \ str(df.select(approx_count_distinct("salary")).collect()[0][0])) print("avg: " + str(df.select(avg("salary")).collect()[0][0])) df.select(collect_list("salary")).show(truncate=False) df.select(collect_set("salary")).show(truncate=False) df2 = df.select(countDistinct("department", "salary")) df2.show(truncate=False) print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0])) print("count: "+str(df.select(count("salary")).collect()[0])) df.select(first("salary")).show(truncate=False) df.select(last("salary")).show(truncate=False) df.select(kurtosis("salary")).show(truncate=False) df.select(max("salary")).show(truncate=False) df.select(min("salary")).show(truncate=False) df.select(mean("salary")).show(truncate=False) df.select(skewness("salary")).show(truncate=False)
udf_cos_sim = udf(lambda cell: float(ref_vec.dot(cell) / (ref_vec.norm(2) * cell.norm(2))), FloatType())

df_transformed \
    .withColumn('cos_similarity', udf_cos_sim(df_transformed.word2vec_large)) \
    .sort(col('cos_similarity'), ascending=False) \
    .select('resource_id', 'cos_similarity') \
    .show(10)

# create a list of terms connected to their stems
df_wordcount = spark \
    .createDataFrame(df_transformed.select(explode(df_transformed.tokens_stop).alias('term'))
                     .groupBy('term')
                     .agg({"*": "count"})
                     .collect())

df_stems = df_wordcount \
    .withColumn('stem', udf(lambda term: opinion_stemm.stem(term), StringType())(col('term'))) \
    .groupBy('stem') \
    .agg(collect_list('term').alias('terms'))

df_stems \
    .select('terms') \
    .filter(df_stems.stem == opinion_stemm.stem('artful')) \
    .first()[0]

# create a count for each opinion of the number of times it has been cited by other Washington opinions
df_citecount = spark.createDataFrame(
    df_transformed.select(explode(df_transformed.opinions_cited).alias('cites'))
    .groupBy('cites')
    .count()
    .collect())
df_citecount.orderBy('count', ascending=False).show()
less_than_week_funds = fund_count.filter(fund_count['fund_count'] <= 5)
less_than_week_funds = less_than_week_funds.select(f.collect_set('fund_id')).collect()[0][0]
less_than_week_funds = [str(fund_id) for fund_id in less_than_week_funds]

# df_uniqTicker30 = df_uniqTicker.where(df_uniqTicker['ticker'].isin(df_Ticker30['ticker']))
df_less = df_uniqfund.where(df_uniqfund.fund_id.isin(less_than_week_funds))
# df_less = df_uniqTicker30

df_less_quantiles = df_less.groupby('client_name', 'fund_id').agg(
    f.collect_list("event_time_numeric").alias("lis_2_event_time_numeric"))
df_less_quantiles = df_less_quantiles.withColumn(
    'Quantile01(fund_MinimumTime)',
    udf_percent_rank('lis_2_event_time_numeric', f.lit(1)))
df_less_quantiles = df_less_quantiles.withColumn(
    'Predicted_Time',
    udf_percent_rank('lis_2_event_time_numeric', f.lit(50)))
# df_less_quantiles = df_less_quantiles.withColumn('Quantile01(Ticker_MinimumTime)', df_less_quantiles['Quantile01(Ticker_MinimumTime)'].cast(IntegerType()))
# df_less_quantiles = df_less_quantiles.withColumn('Predicted_Time', df_less_quantiles['Predicted_Time'].cast(IntegerType()))
df_less_quantiles = df_less_quantiles.withColumn(
    'Quantile01(fund_MinimumTime)',
    udf_string_time('Quantile01(fund_MinimumTime)'))
df_less_quantiles = df_less_quantiles.withColumn(
    'Predicted_Time',
    udf_string_time('Predicted_Time'))
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()


# COMMAND ----------

from pyspark.sql.functions import count, expr
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


# COMMAND ----------

df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"), expr("stddev_pop(Quantity)"))\
  .show()