def append_ranks(self, hour_df, day_df, month_df, year_df):
    """ Ranks based on pageviews within categories. For hour_df the category is a single hour,
    for day_df the category is a single day, and so on.

    Args
        :param hour_df: Dataframe with hourly aggregated data.
        :param day_df: Dataframe with daily aggregated data.
        :param month_df: Dataframe with monthly aggregated data.
        :param year_df: Dataframe with yearly aggregated data.

    Returns:
        :return: hour_df, day_df, month_df, year_df with ranks added.
    """
    # Specify windows to rank within.
    hour_window = Window.partitionBy(['year', 'month', 'day', 'hour']).orderBy(hour_df['pageviews'].desc())
    day_window = Window.partitionBy(['year', 'month', 'day']).orderBy(day_df['pageviews'].desc())
    month_window = Window.partitionBy(['year', 'month']).orderBy(month_df['pageviews'].desc())
    year_window = Window.partitionBy(['year']).orderBy(year_df['pageviews'].desc())

    # Create ranking functions.
    hour_rank = dense_rank().over(hour_window)
    day_rank = dense_rank().over(day_window)
    month_rank = dense_rank().over(month_window)
    year_rank = dense_rank().over(year_window)

    # Apply the ranking functions to make new columns.
    hour_df = hour_df.withColumn("hour_rank", hour_rank)
    day_df = day_df.withColumn("day_rank", day_rank)
    month_df = month_df.withColumn("month_rank", month_rank)
    year_df = year_df.withColumn("year_rank", year_rank)

    return hour_df, day_df, month_df, year_df
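# A minimal, self-contained sketch of the same dense_rank-over-a-window pattern,
# assuming a SparkSession named `spark` is available; the toy column names below
# are illustrative and not part of the original code.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import dense_rank

spark = SparkSession.builder.getOrCreate()
toy_df = spark.createDataFrame(
    [("2020", "A", 10), ("2020", "B", 30), ("2020", "C", 30), ("2021", "A", 5)],
    ["year", "page", "pageviews"],
)
w = Window.partitionBy("year").orderBy(toy_df["pageviews"].desc())
toy_df.withColumn("year_rank", dense_rank().over(w)).show()
# Within each year, pages tied on pageviews share a rank and no rank is skipped.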
def gen_report_table(hc, curUnixDay):
    rows_indoor = sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(
        lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                      seconds=int(p[4]), utoday=int(p[5]), ufirstday=int(p[6])))
    HiveContext.createDataFrame(hc, rows_indoor).registerTempTable("df_indoor")
    # ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 seconds is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 seconds is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_indoor order by entityid,clientmac,utoday"
    df_id_stat = hc.sql(sql)
    # df_id_mm holds the per-(entityid, clientmac) min/max, used to calculate first and last arrival
    df_id_mm = df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # the distinct df is needed for the lag function to work
    df_id_stat_distinct = df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    df_id_prepremon = df_id_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac,
               df_id_mm.entityid == df_id_prepremon.entityid,
               df_id_mm.UFIRSTDAY == df_id_prepremon.UFIRSTDAY]
    df_indoor_fin_tmp = df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(
        df_id_mm.entityid, df_id_mm.clientmac, df_id_mm.utoday, df_id_mm.UFIRSTDAY, df_id_mm.seconds,
        df_id_mm.day_30, df_id_mm.day_7, df_id_mm.min, df_id_mm.max, df_id_mm.total_cnt,
        df_id_prepremon.prepre_mon)
    df_indoor_fin_tmp = df_indoor_fin_tmp.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday", "UFIRSTDAY as ufirstday",
        "seconds as secondsbyday", "day_30 as indoors30", "day_7 as indoors7", "min as FirstIndoor",
        "max as LastIndoor", "total_cnt as indoors", "prepre_mon as indoorsPrevMonth")
    # newly added part for indoors7 and indoors30 based on the current date
    df_indoor_fin_tmp1 = df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay - df_indoor_fin_tmp.utoday) / 86400 < 7, 1).otherwise(0))
    df_indoor_fin_tmp2 = df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay - df_indoor_fin_tmp1.utoday) / 86400 < 30, 1).otherwise(0))
    df_indoor_fin_tmp3 = df_indoor_fin_tmp2.withColumn("r_indoors7", func.sum("r_day_7").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin_tmp4 = df_indoor_fin_tmp3.withColumn("r_indoors30", func.sum("r_day_30").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin = df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
    hc.sql("drop table if exists df_indoor_fin")
    df_indoor_fin.write.saveAsTable("df_indoor_fin")

    rows_flow = sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(
        lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                      utoday=int(p[4]), ufirstday=int(p[5])))
    HiveContext.createDataFrame(hc, rows_flow).registerTempTable("df_flow")
    # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 seconds is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 seconds is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_flow order by entityid,clientmac,utoday"
    df_fl_stat = hc.sql(sql)
    # df_fl_mm holds the per-(entityid, clientmac) min/max, used to calculate first and last arrival
    df_fl_mm = df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # the distinct df is needed for the lag function to work
    df_fl_stat_distinct = df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    df_fl_prepremon = df_fl_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac,
               df_fl_mm.entityid == df_fl_prepremon.entityid,
               df_fl_mm.UFIRSTDAY == df_fl_prepremon.UFIRSTDAY]
    df_flow_fin = df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(
        df_fl_mm.entityid, df_fl_mm.clientmac, df_fl_mm.utoday, df_fl_mm.UFIRSTDAY, df_fl_mm.day_30,
        df_fl_mm.day_7, df_fl_mm.min, df_fl_mm.max, df_fl_mm.total_cnt, df_fl_prepremon.prepre_mon)
    df_flow_fin = df_flow_fin.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday", "UFIRSTDAY as ufirstday",
        "day_30 as visits30", "day_7 as visits7", "min as FirstVisit", "max as LastVisit",
        "total_cnt as visits", "prepre_mon as visitsPrevMonth")
    hc.sql("drop table if exists df_flow_fin")
    df_flow_fin.write.saveAsTable("df_flow_fin")
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")

    # remove the output folders if they still exist
    # (pass the expanded paths explicitly; a bare 'medline_*.parquet' pattern is not
    # glob-expanded by subprocess when shell=False)
    stale = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if stale:
        subprocess.call(['rm', '-rf'] + stale)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    # keep only the latest, non-deleted view of each PMID
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
def spark_timestamp_split( data, ratio=0.75, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_timestamp=DEFAULT_TIMESTAMP_COL, ): """Spark timestamp based splitter The splitter splits the data into sets by timestamps without stratification on either user or item. The ratios are applied on the timestamp column which is divided accordingly into several partitions. Args: data (spark.DataFrame): Spark DataFrame to be split. ratio (float or list): Ratio for splitting data. If it is a single float number it splits data into two sets and the ratio argument indicates the ratio of training data set; if it is a list of float numbers, the splitter splits data into several portions corresponding to the split ratios. If a list is provided and the ratios are not summed to 1, they will be normalized. Earlier indexed splits will have earlier times (e.g the latest time in split[0] <= the earliest time in split[1]) col_user (str): column name of user IDs. col_item (str): column name of item IDs. col_timestamp (str): column name of timestamps. Float number represented in seconds since Epoch. Returns: list: Splits of the input data as spark.DataFrame. """ multi_split, ratio = process_split_ratio(ratio) ratio = ratio if multi_split else [ratio, 1 - ratio] ratio_index = np.cumsum(ratio) window_spec = Window.orderBy(col(col_timestamp)) rating = data.withColumn("rank", row_number().over(window_spec)) data_count = rating.count() rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count) splits = [] for i, _ in enumerate(ratio_index): if i == 0: rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop( "rank" ) else: rating_split = rating_rank.filter( (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1]) ).drop("rank") splits.append(rating_split) return splits
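# A minimal sketch of the same idea on a toy DataFrame, assuming a SparkSession
# named `spark`; it turns row_number over a timestamp ordering into a percentile
# and cuts at 0.75, which is what the splitter above does for ratio=0.75.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [(1, 10, 1.0), (2, 11, 2.0), (3, 12, 3.0), (4, 13, 4.0)],
    ["userID", "itemID", "timestamp"],
)
w = Window.orderBy(col("timestamp"))
n = events.count()
ranked = events.withColumn("pct", row_number().over(w) / n)
train = ranked.filter(col("pct") <= 0.75).drop("pct")   # earliest 75% of rows
test = ranked.filter(col("pct") > 0.75).drop("pct")     # latest 25% of rows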
def _get_relevant_items_by_timestamp( dataframe, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_rating=DEFAULT_RATING_COL, col_timestamp=DEFAULT_TIMESTAMP_COL, col_prediction=DEFAULT_PREDICTION_COL, k=DEFAULT_K ): """Get relevant items for each customer defined by timestamp. Relevant items are defined as k items that appear mostly recently according to timestamps. Args: dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp tuples. col_user (str): column name for user. col_item (str): column name for item. col_rating (str): column name for rating. col_timestamp (str): column name for timestamp. col_prediction (str): column name for prediction. k: number of relevent items to be filtered by the function. Return: spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items. """ window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc()) items_for_user = ( dataframe.select( col_user, col_item, col_rating, row_number().over(window_spec).alias("rank") ) .where(col("rank") <= k) .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user))) .select(col_user, col_prediction) .dropDuplicates([col_user, col_prediction]) ) return items_for_user
def _get_relevant_items_by_threshold( dataframe, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL, threshold=DEFAULT_THRESHOLD ): """Get relevant items for each customer in the input rating data. Relevant items are defined as those having ratings above certain threshold. The threshold is defined as a statistical measure of the ratings for a user, e.g., median. Args: dataframe: Spark DataFrame of customerID-itemID-rating tuples. col_user (str): column name for user. col_item (str): column name for item. col_rating (str): column name for rating. col_prediction (str): column name for prediction. threshold (float): threshold for determining the relevant recommended items. This is used for the case that predicted ratings follow a known distribution. Return: spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items. """ items_for_user = ( dataframe .orderBy(col_rating, ascending=False) .where(col_rating + " >= " + str(threshold)) .select( col_user, col_item, col_rating ) .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user))) .select(col_user, col_prediction) .dropDuplicates() ) return items_for_user
def _get_top_k_items( dataframe, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL, k=DEFAULT_K ): """Get the input customer-item-rating tuple in the format of Spark DataFrame, output a Spark DataFrame in the dense format of top k items for each user. NOTE: if it is implicit rating, just append a column of constants to be ratings. Args: dataframe (spark.DataFrame): DataFrame of rating data (in the format of customerID-itemID-rating tuple). col_user (str): column name for user. col_item (str): column name for item. col_rating (str): column name for rating. col_prediction (str): column name for prediction. k (int): number of items for each user. Return: spark.DataFrame: DataFrame of top k items for each user. """ window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc()) # this does not work for rating of the same value. items_for_user = ( dataframe.select( col_user, col_item, col_rating, row_number().over(window_spec).alias("rank") ) .where(col("rank") <= k) .groupby(col_user) .agg(F.collect_list(col_item).alias(col_prediction)) ) return items_for_user
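# The inline comment above notes that row_number() breaks ties between equal
# ratings arbitrarily. A minimal, hedged sketch of one common workaround (not
# taken from the original code): add a stable secondary sort key. Assumes a
# SparkSession named `spark` and toy userID/itemID/rating columns.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
scores = spark.createDataFrame(
    [("u1", "i1", 5.0), ("u1", "i2", 5.0), ("u1", "i3", 4.0)],
    ["userID", "itemID", "rating"],
)
deterministic_spec = (
    Window.partitionBy("userID")
    .orderBy(col("rating").desc(), col("itemID"))  # itemID breaks ties deterministically
)
scores.withColumn("rank", row_number().over(deterministic_spec)).show()
# i1 and i2 tie on rating, but i1 always ranks first because of the itemID tie-break.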
def add_recent_encounters(encounter_features):
    lowest_date = (encounter_features.select('START')
                   .orderBy('START')
                   .limit(1)
                   .withColumnRenamed('START', 'EARLIEST_TIME'))
    output_df = (encounter_features
                 .crossJoin(lowest_date)
                 .withColumn("day", datediff(col('START'), col('EARLIEST_TIME')))
                 .withColumn("patient_age", datediff(col('START'), col('BIRTHDATE'))))
    w = (Window.orderBy(output_df['day'])
         .partitionBy(output_df['PATIENT'])
         .rangeBetween(-int(num_days), -1))
    for comorbidity_idx in range(num_conditions):
        col_name = "recent_%d" % comorbidity_idx
        output_df = (output_df
                     .withColumn(col_name, sum(col("comorbidity_%d" % comorbidity_idx)).over(w))
                     .withColumn(col_name, coalesce(col(col_name), lit(0))))
    return output_df
def fill_missing_values_forward(sdf, ordering_col, cols_to_fill, *id_cols):
    """
    Fill missing values by carrying previous values forward.

    Args:
        sdf: a Spark DataFrame
        ordering_col: column by which rows should be sorted
        cols_to_fill: list of columns where missing values will be filled
        id_cols: list of columns that collectively form a unique identifier
            that can be used to partition cases.
    """
    lookback_window = Window.partitionBy(*id_cols)\
        .orderBy(ordering_col)\
        .rowsBetween(Window.unboundedPreceding, 0)

    for ctf in cols_to_fill:
        filled_col = ctf + "_filled"
        sdf = sdf.withColumn(
            filled_col,
            fn.last(sdf[ctf], ignorenulls=True).over(lookback_window))

    return sdf
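# A minimal, hedged usage sketch of the forward-fill helper above, assuming a
# SparkSession named `spark`; the patient/visit/temperature columns are made up.
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as fn

spark = SparkSession.builder.getOrCreate()
vitals = spark.createDataFrame(
    [("p1", 1, 98.6), ("p1", 2, None), ("p1", 3, None), ("p2", 1, None)],
    ["patient_id", "visit", "temperature"],
)
filled = fill_missing_values_forward(vitals, "visit", ["temperature"], "patient_id")
filled.show()
# temperature_filled carries 98.6 forward for p1's visits 2 and 3; p2 stays null
# because nothing precedes its first visit.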
def converting_file_into_chunks(self, df, chunk_size=100):
    """
    Split a large pyspark dataframe into chunks and convert each chunk to a
    pandas dataframe.
    :param df: orig dataframe
    :param chunk_size: size of each dataframe chunk
    :return:
    """
    # create a window ordered by the first column and number the rows
    window = Window.orderBy(funct.col(df.columns[0]))
    df = df.withColumn('row_number', funct.row_number().over(window))
    for i in range(0, df.count(), chunk_size):
        chunk = df.where((funct.col('row_number') >= i) &
                         (funct.col('row_number') < (i + chunk_size)))
        logger.info(chunk.count())
        if chunk.count() != 0:
            pd_df = chunk.toPandas()
            # you can do whatever you want with the pandas dataframe here
            pd_df.to_csv("{}_file.csv".format(i))
            print("############")
def etl_load_quote(self):
    logging.debug("Inside transform parking occupancy dataset module")
    quote_common = self.spark.read.\
        parquet(self._load_path + "/partition=Q/*.parquet")
    quote = quote_common.select("trade_dt", "symbol", "exchange", "event_tm", "event_seq_nb",
                                "arrival_tm", "bid_pr", "bid_size", "ask_pr", "ask_size")
    # keep only the latest arrival for each quote key
    quote_corrected = quote.withColumn(
        "row_number",
        F.row_number().over(Window.partitionBy(quote.trade_dt, quote.symbol, quote.exchange,
                                               quote.event_tm, quote.event_seq_nb)
                            .orderBy(quote.arrival_tm.desc()))
    ).filter(F.col("row_number") == 1).drop("row_number")
    quote_corrected.show(3, truncate=False)
    quote_corrected.printSchema()
    logging.debug("Writing transformed quote dataframe to a quote partition")
    quote_corrected.withColumn("trade_date", F.col("trade_dt")).write.\
        partitionBy("trade_dt").mode("overwrite").parquet(self._save_path + "/quote/")
def test_window_functions_without_partitionBy(self): df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) w = Window.orderBy("key", df.value) from pyspark.sql import functions as F sel = df.select( df.value, df.key, F.max("key").over(w.rowsBetween(0, 1)), F.min("key").over(w.rowsBetween(0, 1)), F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))), F.row_number().over(w), F.rank().over(w), F.dense_rank().over(w), F.ntile(2).over(w)) rs = sorted(sel.collect()) expected = [ ("1", 1, 1, 1, 4, 1, 1, 1, 1), ("2", 1, 1, 1, 4, 2, 2, 2, 1), ("2", 1, 2, 1, 4, 3, 2, 2, 2), ("2", 2, 2, 2, 4, 4, 4, 3, 2) ] for r, ex in zip(rs, expected): self.assertEqual(tuple(r), ex[:len(r)])
def add_frequency_total(df_impressions, id_col="user_id", order_col="timestamp"):
    """
    Calculate the `frequency_total`, that is, the frequency of exposure for each
    unique `id_col`. This method also adds the `exposure_index` (1 to
    `frequency_total`) in case it is needed for other calculations.
    The table is partitioned by `id_col` and ordered by `timestamp`.
    """
    if id_col not in df_impressions.columns:
        raise Exception(f"The id_col {id_col} does not exist!")
    if order_col not in df_impressions.columns:
        raise Exception(f"The order_col {order_col} does not exist!")

    window = Window.partitionBy(id_col)

    df = (df_impressions
          .withColumn("exposure_index", F.row_number().over(window.orderBy(order_col)))
          .withColumn("frequency_total", F.count("*").over(window)))

    return df
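# A quick, hedged usage sketch for add_frequency_total, assuming a SparkSession
# named `spark` and that Window/F are imported as in the function above; the
# impression data is made up for illustration.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
impressions = spark.createDataFrame(
    [("u1", 100), ("u1", 101), ("u2", 100)],
    ["user_id", "timestamp"],
)
add_frequency_total(impressions).show()
# u1 gets exposure_index 1 and 2 with frequency_total 2; u2 gets exposure_index 1
# with frequency_total 1.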
def main(spark, model_file, data_file, K): '''Main routine for Collaborative Filtering Model testing Parameters ---------- spark: SparkSession object model_file: string, path to store the model data_file: string, path to the parquet file to load K: int, evaluations are based on predictions of the top K items for each user ''' testIdx = spark.read.parquet(data_file) model = ALSModel.load(model_file) users_val = testIdx.select("user_idx").distinct() perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K) perUserPredictedItemsDF = perUserPredictedItemsDF.select( "user_idx", "recommendations.track_idx").withColumnRenamed( 'user_idx', 'user').withColumnRenamed('recommendations.track_idx', 'items') w2 = Window.partitionBy('user_idx').orderBy(col('count').desc()) perUserActualItemsDF = testIdx.select( 'user_idx', 'track_idx', 'count', F.rank().over(w2).alias('rank')).where( 'rank <= {0}'.format(K)).groupBy('user_idx').agg( expr('collect_list(track_idx) as items')).withColumnRenamed( 'user_idx', 'user') perUserItemsRDD = perUserPredictedItemsDF.join( perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2])) rankingMetrics = RankingMetrics(perUserItemsRDD) print("============================================") print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision) print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K)) print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
def _get_top_k_items( dataframe, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL, k=DEFAULT_K, ): """Get the input customer-item-rating tuple in the format of Spark DataFrame, output a Spark DataFrame in the dense format of top k items for each user. .. note:: if it is implicit rating, just append a column of constants to be ratings. Args: dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of customerID-itemID-rating tuple). col_user (str): column name for user. col_item (str): column name for item. col_rating (str): column name for rating. col_prediction (str): column name for prediction. k (int): number of items for each user. Return: pyspark.sql.DataFrame: DataFrame of top k items for each user. """ window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc()) # this does not work for rating of the same value. items_for_user = ( dataframe.select( col_user, col_item, col_rating, row_number().over(window_spec).alias("rank") ) .where(col("rank") <= k) .groupby(col_user) .agg(F.collect_list(col_item).alias(col_prediction)) ) return items_for_user
def org_predictData(df_buildFeatures, df_holidayList):
    """
    Every day in the early morning, generate the data to be predicted for the next day;
    the prediction dates start from 'today'.
    Logic for preceding3day_price in the prediction set: partition and sort the existing data with
    partitionBy('flineId_noDate', 'intervalDays').orderBy(desc('departDate')), then take the mean of
    the 3 most recent prices as the reference price for the corresponding
    ('flineId_noDate', 'intervalDays') pair in the prediction set.
    """
    window = Window.partitionBy('flineId_noDate', 'intervalDays').orderBy(desc('departDate'))
    df_preceding3day_priceFeatures = df_buildFeatures.withColumn('rank', row_number().over(window)) \
        .filter("rank <= 3") \
        .groupBy('flineId_noDate', 'intervalDays') \
        .agg(avg('price').alias('preceding3day_price'))

    df_predictData = df_buildFeatures.filter((col('departDate') >= to_date(lit(todayStr2)))
                                             & (col('departDate') <= date_add(to_date(lit(todayStr2)), 90))) \
        .select('departDate', 'flineId_noDate') \
        .distinct()\
        .withColumn('todayStr2', lit(todayStr2))
    df_predictData.createOrReplaceTempView('predict_data')
    df_predictData = spark.sql("select t.flineId_noDate,t.departDate,date_add(date_add(todayStr2, 1), p.k) as predictDate from predict_data t\
                               lateral view posexplode(split(space(datediff(t.departDate,date_add(todayStr2,1))),'')) p as k,v")\
        .join(df_holidayList, col('departDate') == col('dateList'), 'left') \
        .drop('dateList') \
        .na.fill({'is_vacation': 'no', 'vacation_days': 0, 'day_of_vacation': 0}) \
        .withColumn('org', split(col('flineId_noDate'), '_').getItem(0)) \
        .withColumn('dst', split(col('flineId_noDate'), '_').getItem(1)) \
        .withColumn('isReturn_type', split(col('flineId_noDate'), '_').getItem(2)) \
        .withColumn('isDirect_type', split(col('flineId_noDate'), '_').getItem(4)) \
        .withColumn('departYear', year('departDate')) \
        .withColumn('departMonth', month('departDate')) \
        .withColumn('depart_dayofmonth', dayofmonth('departDate')) \
        .withColumn('depart_weekofyear', weekofyear('departDate')) \
        .withColumn('depart_dayofweek', datediff('departDate', to_date(lit('2017-01-09'))) % 7 + 1)\
        .withColumn('departQuarter', quarter('departDate')) \
        .withColumn('intervalDays', datediff('departDate', 'predictDate')) \
        .withColumn('intervalMonths', round(months_between('departDate', 'predictDate')))\
        .withColumn('intervalWeeks', round(col('intervalDays') / 7)) \
        .withColumn('intervalQuarters', round(col('intervalDays') / 120))\
        .join(df_preceding3day_priceFeatures, ['flineId_noDate', 'intervalDays'], 'left')\
        .dropna(how='any')  # preceding3day_price has nulls (4,363,499 missing), so drop those rows

    return df_predictData
def locale_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the ratio of the top locales in each country over the last week.

    parameters:
        data: The main ping server
        date: The date to find the locale distribution
        topN: The number of locales to get for each country. Only does the top N.
        period: The number of days before the date to include in the analysis
        country_list: The list of countries to look at in the analysis

    output:
        dataframe with columns:
            ['country', 'submission_date_s3', 'locale', 'pct_on_locale']
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country')\
        .agg(countDistinct('client_id').alias('WAU'))

    locale_wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country', 'locale')\
        .agg(countDistinct('client_id').alias('WAU_on_locale'))\
        .select(lit(begin).alias('start_date'),
                lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_locale', 'locale')

    res = locale_wau.join(wau, 'country', how='left')\
        .select('start_date', 'submission_date_s3', 'country', 'WAU_on_locale', 'locale', 'WAU')

    rank_window = Window.partitionBy('country', 'submission_date_s3').orderBy(desc('WAU_on_locale'))

    return res.select('*', F.row_number().over(rank_window).alias('rank'))\
        .filter(col('rank') <= topN)\
        .select('submission_date_s3', 'country', 'locale',
                (100.0 * col('WAU_on_locale') / col('WAU')).alias('pct_on_locale'))
def test_rolling_with_non_window_op(client, ibis_windows, spark_range): """ Test context adjustment for rolling window and non window ops non window ops should calculate only with in user's context, while rolling window should calculate on expanded context. For a rolling window of 1 day, and a `count` aggregation time value 2020-01-01 a 2020-01-02 b 2020-01-03 c 2020-01-04 d with context = (2020-01-02, 2002-01-04), result will be: time value roll_count count 2020-01-02 b 2 3 2020-01-03 c 2 3 2020-01-04 d 2 3 Because there are 3 rows within user context (01-02, 01-04), count should return 3 for every row, rather 4, based on the adjusted context (01-01, 01-04). """ table = client.table('time_indexed_table') context = ( pd.Timestamp('20170102 07:00:00', tz='UTC'), pd.Timestamp('20170105', tz='UTC'), ) result_pd = table.mutate( count_1h=table['value'].count().over(ibis_windows[0]), count=table['value'].count(), ).execute(timecontext=context) spark_table = table.compile() spark_window_1h = (Window.partitionBy('key').orderBy( F.col('time').cast('long')).rangeBetween(*spark_range[0])) expected = (spark_table.withColumn( 'count_1h', F.count(spark_table['value']).over(spark_window_1h)).withColumn( 'count', F.count(spark_table['value'])).toPandas()) expected = expected[expected.time.between( *(t.tz_convert(None) for t in context))].reset_index(drop=True) tm.assert_frame_equal(result_pd, expected)
def run(): try: args = read_args() data = [ "{pwd}/data/part-0000{x}.json.gz".format(pwd=PWD, x=y) for y in range(args.files) ] spark = SparkSession.builder \ .appName("Calculates Sessions Spark") \ .getOrCreate() logger.info("build a new instance.") logger.info("read data: {data}.".format(data=data)) df = spark.read.json(path=data, schema=schema()) logger.info("rows count: {rows}.".format(rows=df.count())) # -1800 seconds = -60 seconds * 30 minutes w = Window.partitionBy("anonymous_id").orderBy("device_sent_timestamp").rangeBetween(start=-1800, end=0) # add column events_on_session, based on window spec df = df.withColumn("events_on_session", F.count(df.device_sent_timestamp).over(w)) # add column event_time, based on column device_sent_timestamp df = df.withColumn("event_time", F.to_timestamp(df.device_sent_timestamp)) # filter df by first event on session df = df.filter(df.events_on_session == 1) logger.info("distinct sessions: {rows}.".format(rows=df.count())) if args.all or args.browser: sessions_by_column(df, "browser_family", args) if args.all or args.os: sessions_by_column(df, "os_family", args) if args.all or args.device: sessions_by_column(df, "device_family", args) logger.info("finished.") except Exception as e: logger.error("failed: {msg}.".format(msg=e))
def _get_relevant_items_by_threshold( dataframe, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL, threshold=DEFAULT_THRESHOLD, ): """Get relevant items for each customer in the input rating data. Relevant items are defined as those having ratings above certain threshold. The threshold is defined as a statistical measure of the ratings for a user, e.g., median. Args: dataframe: Spark DataFrame of customerID-itemID-rating tuples. col_user (str): column name for user. col_item (str): column name for item. col_rating (str): column name for rating. col_prediction (str): column name for prediction. threshold (float): threshold for determining the relevant recommended items. This is used for the case that predicted ratings follow a known distribution. Return: pyspark.sql.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items. """ items_for_user = ( dataframe.orderBy(col_rating, ascending=False) .where(col_rating + " >= " + str(threshold)) .select(col_user, col_item, col_rating) .withColumn( col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)) ) .select(col_user, col_prediction) .dropDuplicates() ) return items_for_user
def is_monotonic_decreasing(self): """ Return boolean if values in the object are monotonically decreasing. .. note:: the current implementation of is_monotonic_decreasing uses Spark's Window without specifying partition specification. This leads to move all data into single partition in single machine and could cause serious performance degradation. Avoid this method against very large dataset. Returns ------- is_monotonic : boolean Examples -------- >>> ser = ks.Series(['4/1/2018', '3/1/2018', '1/1/2018']) >>> ser.is_monotonic_decreasing True >>> df = ks.DataFrame({'dates': [None, '3/1/2018', '2/1/2018', '1/1/2018']}) >>> df.dates.is_monotonic_decreasing False >>> df.index.is_monotonic_decreasing False >>> ser = ks.Series([1]) >>> ser.is_monotonic_decreasing True >>> ser = ks.Series([]) >>> ser.is_monotonic_decreasing True >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic_decreasing True """ col = self._scol window = Window.orderBy(self._kdf._internal.index_scols).rowsBetween(-1, -1) return self._with_new_scol((col <= F.lag(col, 1).over(window)) & col.isNotNull()).all()
def _is_monotonic_decreasing(self): scol = self.spark.column window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1) prev = F.lag(scol, 1).over(window) cond = F.lit(True) has_not_null = F.lit(True) for field in self.spark.data_type[::-1]: left = scol.getField(field.name) right = prev.getField(field.name) compare = MultiIndex._comparator_for_monotonic_decreasing( field.dataType) # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex. # Therefore, we should check `has_not_null` over the all levels. has_not_null = has_not_null & left.isNotNull() cond = F.when(left.eqNullSafe(right), cond).otherwise( compare(left, right, spark.Column.__lt__)) cond = has_not_null & (prev.isNull() | cond) cond_name = verify_temp_column_name( self._internal.spark_frame.select( self._internal.index_spark_columns), "__is_monotonic_decreasing_cond__", ) sdf = self._internal.spark_frame.select( self._internal.index_spark_columns + [cond.alias(cond_name)]) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_dtypes=self._internal.index_dtypes, ) return first_series(DataFrame(internal))
def aggregation(dataframe, aggregation_function, window_size, step_size):
    grouping_functions = ["avg", "sum", "min", "max"]
    if aggregation_function in grouping_functions:
        if "timestamp" not in dataframe.columns:
            raise ValueError("timestamp column not found!")
        # aggregate each signal over a forward-looking window of `window_size` rows
        w = Window.orderBy('timestamp').rowsBetween(0, int(window_size))
        signals = dataframe.columns.copy()
        signals.remove('timestamp')
        method = getattr(pyspark.sql.functions, aggregation_function)
        dataframe = dataframe.select(
            'timestamp',
            *(method(column).over(w) for column in signals),
            pyspark.sql.functions.monotonically_increasing_id().alias("idx"))
    elif aggregation_function == "first":
        dataframe = dataframe.withColumn(
            "idx", pyspark.sql.functions.monotonically_increasing_id())
    else:
        raise ValueError(
            "aggregation_function should be one of avg, sum, min, max or first")
    # keep every step_size-th row
    dataframe = dataframe.filter(dataframe.idx % int(step_size) == 0)
    return dataframe.drop("idx")
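# A hedged usage sketch: downsample a toy signal by averaging over a forward-looking
# window of 3 rows and keeping every 3rd id. Assumes a SparkSession named `spark`
# and the module-level imports the function above relies on; note that the
# monotonically_increasing_id-based step filter depends on partitioning, and the
# aggregated columns keep Spark's auto-generated names.
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions

spark = SparkSession.builder.getOrCreate()
signal = spark.createDataFrame(
    [(i, float(i % 5)) for i in range(12)], ["timestamp", "sensor_a"]
)
aggregation(signal, "avg", window_size=3, step_size=3).show()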
def __init__(self, spark=None, code=None, lim=None, dataset_type='full', sample=True, balanced=False): """ Defines the arguments to be used in the data loader class: code: the census type of data you are using, {"sic", "soc"} lim: integer limit of dataframe when loading it from Hue. sample: data taken from the 1% sample """ self.code = code if not sample: if self.code == 'sic': source_string = "SELECT * FROM 2011_census_identified.cen_ident_2011_stage_0_writein_industry_std" elif self.code == 'soc': # source_string = "SELECT * FROM 2011_census_identified.cen_ident_2011_stage_0_writein_occupation_std where occ_job_title is not null and occupation_code is not null" source_string = "SELECT * FROM 2011_census_identified.cen_ident_2011_stage_0_writein_occupation_std" data_source = 'sql' return_as_pandas = False elif self.code in ['sic', 'soc']: source_string = get_file_path(self.code, dataset_type, balanced) data_source = 'file' return_as_pandas = True else: raise ValueError( "Code is not recognised. Please use either sic or soc") self.df = load_data(source_string, spark, data_source, lim, return_as_pandas) if not sample: # add index self.df = self.df.withColumn( "id", F.row_number().over( Window.orderBy(F.monotonically_increasing_id())) - 1)
def main(): conf = SparkConf().setAppName("SearchECPMStat") sc = SparkContext(conf=conf) sqlContext = HiveContext(sc) sc.setLogLevel("WARN") # data clean print('ECPMStat Start get_order_table') orderTable = get_order_table(sc, sqlContext) print('ECPMStat Finish get_order_table') print('ECPMStat Start get_sensor_table') sensorTable = get_sensor_table(sc, sqlContext) print('ECPMStat Finish get_sensor_table') print('ECPMStat Start calculate ecpm value') cond = [ orderTable['sell_label'] == sensorTable['search_word'], orderTable['productId'] == sensorTable['product_id'] ] table = orderTable.join(sensorTable, cond, 'inner').\ withColumn('exposure_ecpm', orderTable.sales_amount / sensorTable.exposure_count * 100.0).\ withColumn('click_ecpm', orderTable.sales_amount / sensorTable.click_count * 100.0).\ select('search_word', 'product_id', 'exposure_ecpm', 'click_ecpm') # result limit table = table.withColumn( 'row_number', row_number().over( Window.partitionBy("search_word").orderBy(desc("exposure_ecpm")))) table = table.filter(table['row_number'] <= CONFIG['product_list_limit_per_query']).\ drop('row_number') if CONFIG['do_save_table']: table.write.saveAsTable('recommend.ecpm_final' + CONFIG['table_suffix'], mode='overwrite') print('ECPMStat Finish calculate ecpm value') print('ECPMStat finished!############################################') sc.stop()
def create_city_dimention(spark, cities_fact_df, temprature_df):
    try:
        # Select distinct cities
        cities_dims_df = cities_fact_df.select("City", "State", "State Code").distinct()

        # Assign a unique id to each city as city_code
        cities_window = Window.orderBy(f.col("City"), f.col("State"))
        cities_dims_df = cities_dims_df.withColumn(
            "city_code", f.row_number().over(cities_window)).withColumnRenamed(
                "State Code", "state_code")

        # Join with the temperature DF to get Lat, Long & Temp
        cities_dims_df = cities_dims_df.join(temprature_df, on=['City'], how='left')

        return cities_dims_df
    except Exception as e:
        print(str(e))
def SupplementTags2(spark, TagSynMapping, TagName): # http://ec2-44-235-91-5.us-west-2.compute.amazonaws.com:8888/notebooks/findspark.py.ipynb TagSynMapping = TagSynMapping.select( col("key").alias("TagName"), col("value").alias("id")) TagName = TagName.drop('Count').withColumn("id", lit(None).cast( StringType())).select('*') # Concate two column result = TagSynMapping.union(TagName) w = Window.partitionBy('TagName') result = result.select('TagName', 'id', count('TagName').over(w).alias('n')) # Get rid of duplicate result = result.dropDuplicates(subset=['TagName']) # Generate mapping from name to mapping = result.groupBy(result.id).count().select( 'id', col('count').alias('n')).withColumn( "id2", monotonically_increasing_id()).select('*') #mapping.show() def working_fun2(mapping): def f(x): return mapping.value.get(x) return udf(f) mapping = {entry[0]: entry[2] for entry in mapping.collect()} b = spark.sparkContext.broadcast(mapping) result = result.withColumn( 'new_id', when(col('id').isNotNull(), working_fun2(b)(col('id'))).otherwise( monotonically_increasing_id())) result = result.drop('id', 'n') #result.show() return result
def stratify_data(self, dataset): nfolds = self.numFolds df = dataset.withColumn("id", monotonically_increasing_id()) windowval = (Window.partitionBy( self.labelCol).orderBy('id').rangeBetween( Window.unboundedPreceding, 0)) stratified_data = df.withColumn('cum_sum', f.sum(f.lit(1)).over(windowval)) stratified_data = stratified_data.withColumn("bucket_fold", f.col("cum_sum") % nfolds) if self.stratify_summary: self.stratify_summary = stratified_data.withColumn("bucket_fold",f.concat(f.lit("fold_"),f.col("bucket_fold") + 1)).\ groupby(self.labelCol).\ pivot("bucket_fold").\ agg(f.count("id")) else: self.stratify_summary = "To create summary rerun with stratify_summary = True" stratified_data = stratified_data.drop(*["id", "cum_sum"]) return stratified_data
def transformData(prices): # Window to get last trading day's metrics for each stock w = Window.partitionBy('Symbol').orderBy('Date') # Compute daily stock price changes returns = prices.withColumn('Return', (F.col('Adj Close')/F.col('Open')).astype('decimal(10,4)'))\ .withColumn('Spread', (F.col('High')/F.col('Low')).astype('decimal(10,5)'))\ .na.fill({'Return': 0.0, 'Spread': 0.0})\ .select('Date','Symbol','Sector','Industry','Return','Spread','Volume')\ .withColumn('yReturn', F.lag('Return').over(w))\ .withColumn('ySpread', F.lag('Spread').over(w))\ .withColumn('yVolume', F.lag('Volume').over(w))\ .na.fill({'yReturn': 0.0, 'ySpread': 0.0, 'yVolume': 0}) # Filter out extraordinary daily returns or trading volumes (noise) returns = returns.filter(returns.Return.between(0.5,1.5) & returns.yReturn.between(0.5,1.5) & returns.Spread.between(0.5,1.5) & returns.ySpread.between(0.5,1.5))\ .filter('Volume>0 and yVolume>0') return returns
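# A minimal sketch of the lag-over-a-per-symbol-window pattern used above,
# assuming a SparkSession named `spark`; the prices are made up for illustration.
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
px = spark.createDataFrame(
    [("AAPL", "2020-01-02", 1.01), ("AAPL", "2020-01-03", 0.99), ("MSFT", "2020-01-02", 1.02)],
    ["Symbol", "Date", "Return"],
)
w = Window.partitionBy("Symbol").orderBy("Date")
px.withColumn("yReturn", F.lag("Return").over(w)).na.fill({"yReturn": 0.0}).show()
# Each row carries the previous trading day's Return for the same Symbol; the
# first day per Symbol has no predecessor and is filled with 0.0.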
def etl_load_trade(self):
    logging.debug("Inside transform parking occupancy dataset module")
    trade_common = self.spark.read.\
        parquet(self._load_path + "/partition=T/*.parquet")
    trade = trade_common.select("trade_dt", "symbol", "exchange",
                                "event_tm", "event_seq_nb", "arrival_tm", "trade_pr")
    # keep only the latest arrival for each trade key
    trade_corrected = trade.withColumn(
        "row_number",
        F.row_number().over(Window.partitionBy(trade.trade_dt, trade.symbol, trade.exchange,
                                               trade.event_tm, trade.event_seq_nb)
                            .orderBy(trade.arrival_tm.desc()))
    ).filter(F.col("row_number") == 1).drop("row_number")
    trade_corrected.show(3, truncate=False)
    logging.debug("Writing transformed trade dataframe to a trade partition")
    trade_corrected.withColumn("trade_date", F.col("trade_dt")).write.\
        partitionBy("trade_dt").mode("overwrite").parquet(self._save_path + "/trade/")
def get_relevant_items_by_threshold( dataframe, col_user="******", col_item="itemID", col_rating="rating", threshold=3.5, ): """Get relevant items for each customer in the input rating data. Relevant items are defined as those having ratings above certain threshold. The threshold is defined as a statistical measure of the ratings for a user, e.g., median. Args: dataframe: Spark DataFrame of customerID-itemID-rating tuples. col_user (str): column name for user. col_item (str): column name for item. col_rating (str): column name for rating. threshold: threshold for determining the relevant recommended items. This is used for the case that predicted ratings follow a known distribution. Return: spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items. """ items_for_user = ( dataframe .orderBy(col_rating, ascending=False) .where(col_rating + " >= " + str(threshold)) .select( col_user, col_item, col_rating ) .withColumn("prediction", F.collect_list(col_item).over(Window.partitionBy(col_user))) .select(col_user, "prediction") .dropDuplicates() ) return items_for_user
def back_fill_dataframe(df, partition_col, columns_to_fill): df = df.withColumn('Dummy_ID', F.monotonically_increasing_id()) # backfilled_df = df.withColumn('Dummy_ID', F.monotonically_increasing_id()) backfilled_final_df = df.select('Dummy_ID', 'PropHash') for column_to_fill in columns_to_fill: df_a = df.select('Dummy_ID', 'PropHash', column_to_fill) df_b = df.select('Dummy_ID', 'PropHash', column_to_fill) backfilled_df = df_a.crossJoin(df_b).where((df_a[partition_col] >= df_b[partition_col]) & (df_a[column_to_fill].isNotNull() | df_b[column_to_fill].isNotNull())) #for c in backfilled_df.columns: print('Column is {}'.format(column_to_fill)) back_fill_window = Window.partitionBy(df_a[partition_col]).orderBy(df_b[partition_col]) backfilled_df = backfilled_df.withColumn('row_num', F.row_number().over(back_fill_window)) backfilled_df = backfilled_df.filter(F.col('row_num') == 1) backfilled_df = backfilled_df.select(df_a.Dummy_ID, df_a.PropHash, F.coalesce(df_a[column_to_fill], df_b[column_to_fill]).alias(column_to_fill)) backfilled_final_df = backfilled_final_df.join(backfilled_df, ['Dummy_ID', 'PropHash'], 'left') return backfilled_final_df
def assign_group(model_dict, df, d, strata_cols, colname):
    '''given (1) a dictionary of ranges, (2) a DF with random values ranked by
    random block, and (3) a name for the grouped column, return a DF with a new
    column that assigns group membership'''
    window = Window.orderBy('dataset_rnd')\
                   .partitionBy(*model_dict['strata_cols'])
    df = df.withColumn('dataset_rk', F.percent_rank().over(window))
    for i, (k, v) in enumerate(d.iteritems()):
        # if the bottom of the range is 0, make it -1 so that a percent_rank of exactly 0
        # is included; otherwise use the range's own lower bound
        min_val = -1 if v[0] == 0 else v[0]
        if i == 0:
            group_assign_cond = F.when((col('dataset_rk') > min_val) &
                                       (col('dataset_rk') <= v[1]), F.lit(k))
        else:
            group_assign_cond = group_assign_cond.when(
                (col('dataset_rk') > min_val) & (col('dataset_rk') <= v[1]), F.lit(k))
    return df.withColumn(colname, group_assign_cond)
def window_sample(): def make_keywords_count_kv(r): for keyword in r.keywords: yield (r.region, keyword), r.twi def format_db((region, records)): out = [{'keyword': r.keyword, 'score': r.score} for r in records] return region, 'keywords', json.dumps(out) sc = SparkContext() sql_ctx = SQLContext(sc) scheme = StructType([ StructField('region', StringType()), StructField('keywords', ArrayType(StringType())), StructField('twi', IntegerType()), ]) rdd = sc.parallelize( [('bj', ['ni'], 10), ('bj', ['ni'], 10), ('sh', ['ni'], 20), ('sh', ['ni'], 30), ('sh', ['wo'], 70)]) df = sql_ctx.createDataFrame(rdd, scheme) window = Window.partitionBy('region').orderBy(functions.desc('score')) df = df.rdd \ .flatMap(make_keywords_count_kv) \ .reduceByKey(add) \ .map(lambda (k, v): list(k) + [v]) \ .toDF(['region', 'keyword', 'score']) \ .withColumn('rank', functions.row_number().over(window)) \ .where('rank<=30') \ .coalesce(500) \ .rdd \ .keyBy(lambda r: r.region) \ .groupByKey() \ .map(format_db).toDF(['region', 'keywords', 'scores']) for item in df.toJSON().take(10): print(item)
def compile_window_op(t, expr, scope, **kwargs):
    op = expr.op()
    window = op.window
    operand = op.expr

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else compile_with_scope(t, key, scope)
        for key, key_op in zip(group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    ordering_keys = [
        key.to_expr().get_name()
        for key in map(operator.methodcaller('op'), order_by)
    ]

    pyspark_window = Window.partitionBy(grouping_keys).orderBy(ordering_keys)
    result = t.translate(operand, scope, window=pyspark_window)

    return result
def history_report(year_month):
    sc = SparkContext()
    spark = SparkSession.builder\
        .appName("history_report")\
        .getOrCreate()
    history_report = spark.read.parquet(
        f"hdfs://teb101-1.cloudera.com//user/admin/twstock/analysis/daily_report/{year_month}*.parquet"
    )
    history_report.createOrReplaceTempView("history_report")
    sql = "SELECT sID,bID,Date,total_buy,total_sell,\
          (total_buy*avg_buy_price) as cost,(total_sell*avg_sell_price) as recieve,\
          storage,close\
          FROM history_report\
          ORDER BY sID,bID,Date"

    ## Using Window to cumsum data
    window = Window.orderBy("Date").partitionBy("sID", "bID").rangeBetween(
        Window.unboundedPreceding, 0)
    history_report = spark.sql(sql).withColumn('cumsum_storage', sum('storage').over(window))\
        .withColumn('cumsum_cost', (sum('cost')).over(window))\
        .withColumn('cumsum_buy', (sum("total_buy")).over(window))\
        .withColumn('cumsum_recieve', (sum('recieve')).over(window))\
        .withColumn('cumsum_sell', (sum('total_sell')).over(window))

    # alias column names
    history_report = history_report.select("sID", "bID", "Date", "cumsum_cost", "cumsum_buy",
                                           "cumsum_recieve", "cumsum_sell",
                                           (col("cumsum_cost") / col("cumsum_buy")).alias("avg_buy_price"),
                                           (col("cumsum_recieve") / col("cumsum_sell")).alias("avg_sell_price"),
                                           "close", "cumsum_storage")

    # changing type of data
    history_report = history_report.withColumn("avg_buy_price", col("avg_buy_price").cast(FloatType()))\
        .withColumn("avg_sell_price", col("avg_sell_price").cast(FloatType()))\
        .withColumn("cumsum_storage", col("cumsum_storage").cast(FloatType()))

    history_report.write.mode("append").parquet(
        f"/user/admin/twstock/analysis/history/")
def extract_road_segments_df(spark, use_cache=True): cache = workdir + "data/road-network.parquet" if os.path.isdir(cache) and use_cache: print("Skip extraction of road network dataframe: already done," " reading from file") return spark.read.parquet(cache) print("Extracting road network dataframe...") cols = [ "street_name", "street_type", "center_long", "center_lat", "coord_long", "coord_lat", "nid", ] road_seg_df = get_road_segments_RDD(spark).flatMap(kml_extract_RDD).toDF( cols) # Some specific road segments have the same nid w = Window.partitionBy("nid").orderBy("center_lat") street_ids = (road_seg_df.select( "nid", "center_lat", "center_long").distinct().select( "center_lat", "center_long", concat("nid", row_number().over(w)).alias("street_id"), )) road_seg_df = road_seg_df.join(street_ids, ["center_lat", "center_long"]).drop("nid") if use_cache: road_seg_df.write.parquet(cache) print("Extracting road network dataframe done") return road_seg_df
        | 3| a|
        | 1| b|
        | 2| b|
        | 3| b|
        | 1| c|
        | 3| c|
        +----+---+
    """
    schema = StructType([StructField("attr", StringType(), True),
                         StructField("obj", StringType(), True)])
    aoDF = sqlCtx.createDataFrame(aoPair, schema)

    # Window that moves over rows of same obj and sorted by attr
    window = Window.orderBy("attr").partitionBy("obj")

    ## Prev column contains previous attr of the same object
    """
    Transformed Table
    +----+---+----+
    |attr|obj|prev|
    +----+---+----+
    |   1|  a|null|
    |   2|  a|   1|
    |   3|  a|   2|
    |   1|  b|null|
    |   2|  b|   1|
    |   3|  b|   2|
    |   1|  c|null|
    |   3|  c|   1|
def spark_chrono_split( data, ratio=0.75, min_rating=1, filter_by="user", col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_timestamp=DEFAULT_TIMESTAMP_COL, ): """Spark chronological splitter This function splits data in a chronological manner. That is, for each user / item, the split function takes proportions of ratings which is specified by the split ratio(s). The split is stratified. Args: data (spark.DataFrame): Spark DataFrame to be split. ratio (float or list): Ratio for splitting data. If it is a single float number it splits data into two sets and the ratio argument indicates the ratio of training data set; if it is a list of float numbers, the splitter splits data into several portions corresponding to the split ratios. If a list is provided and the ratios are not summed to 1, they will be normalized. seed (int): Seed. min_rating (int): minimum number of ratings for user or item. filter_by (str): either "user" or "item", depending on which of the two is to filter with min_rating. col_user (str): column name of user IDs. col_item (str): column name of item IDs. col_timestamp (str): column name of timestamps. Returns: list: Splits of the input data as spark.DataFrame. """ if not (filter_by == "user" or filter_by == "item"): raise ValueError("filter_by should be either 'user' or 'item'.") if min_rating < 1: raise ValueError("min_rating should be integer and larger than or equal to 1.") multi_split, ratio = process_split_ratio(ratio) split_by_column = col_user if filter_by == "user" else col_item if min_rating > 1: data = min_rating_filter_spark( data, min_rating=min_rating, filter_by=filter_by, col_user=col_user, col_item=col_item, ) ratio = ratio if multi_split else [ratio, 1 - ratio] ratio_index = np.cumsum(ratio) window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp)) rating_grouped = ( data.groupBy(split_by_column) .agg({col_timestamp: "count"}) .withColumnRenamed("count(" + col_timestamp + ")", "count") ) rating_all = data.join(broadcast(rating_grouped), on=split_by_column) rating_rank = rating_all.withColumn( "rank", row_number().over(window_spec) / col("count") ) splits = [] for i, _ in enumerate(ratio_index): if i == 0: rating_split = rating_rank.filter(col("rank") <= ratio_index[i]) else: rating_split = rating_rank.filter( (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1]) ) splits.append(rating_split) return splits
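# A hedged usage sketch for spark_chrono_split, assuming a SparkSession named
# `spark`, that the module's helpers (process_split_ratio, min_rating_filter_spark)
# are importable as above, and that the DEFAULT_* constants resolve to the
# userID/itemID/rating/timestamp column names used here; the ratings are made up.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 1.0), (1, 11, 3.0, 2.0), (1, 12, 5.0, 3.0), (1, 13, 2.0, 4.0)],
    ["userID", "itemID", "rating", "timestamp"],
)
train, test = spark_chrono_split(ratings, ratio=0.75)
# For each user, the earliest 75% of that user's ratings land in train and the
# most recent 25% in test; the helper "rank" and "count" columns remain attached.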
def processData(sc, hc, fs, con, incFileName, inThresh, outThresh, progTag):
    #incFileName="hdfs://namenode.navroomhdp.com:8020/data/t_cf_inc/100033/t_cf_20161028.txt"
    #inThresh=10
    #outThresh=300
    #**************************************
    #
    # this procedure uses the inc file to calculate:
    #   flow   (one record per day)  stored as a file
    #   indoor (one record per day)  stored as a file
    #   indoor_for_delete (every indoor record)  stored in hbase
    #   indoor detail (every indoor record)      stored in hbase and as a file
    #
    # destIndoorFile     : /data/indoor/entityid/year/id_date.json      used to generate reports
    # destFlowFile       : /data/flow/entityid/year/fl_date.json        used to generate reports
    # rec_destIndoorfile : /data/rec_indoor/entityid/year/id_date.json  this folder is a mirror of the hbase records
    #
    #**************************************
    destIndoorFile = get_str_indoorFileName(incFileName)
    #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt
    rec_destIndoorfile = destIndoorFile.replace("/indoor/", "/rec_indoor/")
    #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt
    destFlowFile = destIndoorFile.replace("/indoor/", "/flow/").replace("id_", "fl_")
    #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt
    tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor" + str(progTag)
    tmp_destFlowFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow" + str(progTag)
    tmp_rec_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor" + str(progTag)
    EntityID = int(get_str_entityID(incFileName))  #101762
    histFileName = get_str_histFileName(incFileName)
    #processed files will be placed here
    #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt
    if fs.exists(sc._jvm.Path(histFileName)):
        tmpFileName = get_str_tmpFileName(histFileName)
        #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt
        tmpFolderName = tmpFileName.rsplit('/', 1)[0] + "tmp"
        #tmpFolderName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp
        #copy the hist file to the temp folder, name it .../101762tmp/hist and destroy the hist file
        sc._jvm.FileUtil.copy(fs, sc._jvm.Path(histFileName), fs, sc._jvm.Path(tmpFolderName + "/hist"), True, True, con)
        #copy the inc file to the temp folder, name it .../101762tmp/inc and destroy the inc file
        sc._jvm.FileUtil.copy(fs, sc._jvm.Path(incFileName), fs, sc._jvm.Path(tmpFolderName + "/inc"), True, True, con)
        #copymerge the 2 files (inc and hist) into one file
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName), fs, sc._jvm.Path(tmpFileName), True, con, None)
        sc._jvm.FileUtil.copy(fs, sc._jvm.Path(tmpFileName), fs, sc._jvm.Path(incFileName), True, True, con)
    unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName)  # 1475251200 is 20161001 as a unix date
    startUnixTime = get_int_fileNameUnixDate(incFileName)  # 1456808400, today's unix datetime
    rows_t_cf = sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1], flag=p[2]))
    HiveContext.createDataFrame(hc, rows_t_cf).registerTempTable("t_cf_inc_tmp")
    hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf")
    df = hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1")
    df1 = df.withColumn("diff", df["stime"] - df["lag_time"]).na.fill(-1)
    df1.filter((df1.diff >= outThresh) | (df1.lag_time == -1) | (df1.lead_time == -1)).registerTempTable("df2")
    df2 = hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < " + str(outThresh) + " and diff>0) then diff ELSE 0 end as diff from df2")
    df3 = df2.withColumn("lag_time1", df2.lag_time + df2.diff).drop("lag_time")
    df3.withColumn("lag_time2", func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3")
    df4 = hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1)
    df5 = df4.filter((df4.LTime > 0) & (df4.Seconds >= inThresh) & (df4.ETime > startUnixTime) & (df4.ETime < (startUnixTime + 86400))).withColumn("ENTITYID", lit(EntityID))  # 86400 is the number of seconds in one day
    df5.registerTempTable("df5")
    #df5 will be saved to hbase as indoor details (rec_destIndoorfile); df6 and df7 will be used for stats calculation
    df6 = hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5")
    df6.registerTempTable("df6_indoor")
    df7 = hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday")
    df_current_result = df7.withColumn("ENTITYID", lit(EntityID)).withColumn('UFIRSTDAY', lit(unixFirtDayofMonth))
    flow_sql = "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >" + str(startUnixTime) + " and stime <" + str(startUnixTime + 86400) + " group by clientmac"
    hc.sql(flow_sql).registerTempTable("df_flow_tmp")
    df_flow = hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID", lit(EntityID)).withColumn('UFIRSTDAY', lit(unixFirtDayofMonth))
    #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
    #df_flow.saveAsTable("T_FLOW")
    if len(df5.head(1)) == 1:  # df5 is not empty; head(1) is cheaper than df5.rdd.isEmpty
        tmp_rec_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor" + str(progTag)
        df5.select('clientmac', 'entityid', 'etime', 'ltime', 'seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder)
        #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder)
        df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
            #the old indoor folder exists; generate df_delete_pk so phoenix can delete invalid rows
            rows_rec_indoor = sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]), seconds=int(p[4])))
            HiveContext.createDataFrame(hc, rows_rec_indoor).registerTempTable("df_old_indoor")
            df_old_indoor_pk = hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor")
            df_current_result_pk = hc.sql("select ClientMac,ENTITYID,ETime from df5")
            df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk)
            if len(df_delete_pk.head(1)) == 1:
                df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
    else:
        tmp_rec_destIndoorFolder = "NONE"
    if len(df_flow.head(1)) == 1:
        tmp_destFlowFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow" + str(progTag)
        df_flow.select('clientmac', 'entityid', 'etime', 'ltime', 'utoday', 'ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder)
        #df_flow.write.mode('overwrite').json(tmp_destFlowFolder)
    else:
        tmp_destFlowFolder = "NONE"
    if len(df_current_result.head(1)) == 1:
        tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor" + str(progTag)
        df_current_result.select('clientmac', 'entityid', 'etime', 'ltime', 'seconds', 'utoday', 'ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder)
        #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder)
    else:
        tmp_destIndoorFolder = "NONE"
    sc._jvm.FileUtil.copy(fs, sc._jvm.Path(incFileName), fs, sc._jvm.Path(histFileName), True, True, con)
    if fs.exists(sc._jvm.Path(destIndoorFile)):
        fs.delete(sc._jvm.Path(destIndoorFile))
    if fs.exists(sc._jvm.Path(destFlowFile)):
        fs.delete(sc._jvm.Path(destFlowFile))
    if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
        fs.delete(sc._jvm.Path(rec_destIndoorfile))
    #delete is a must if the file already exists, otherwise copyMerge will fail
    if tmp_destIndoorFolder != "NONE":
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder), fs, sc._jvm.Path(destIndoorFile), True, con, None)  #destIndoorFile=get_str_indoorFileName(incFileName)
    if tmp_destFlowFolder != "NONE":
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder), fs, sc._jvm.Path(destFlowFile), True, con, None)
    if tmp_rec_destIndoorFolder != "NONE":
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder), fs, sc._jvm.Path(rec_destIndoorfile), True, con, None)