Example #1
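# Assumed setup, not part of the original snippet: the surrounding job is
# expected to provide `spark`, `df`, `params`, `logger`, `spa_utils`,
# `calculate_baseline_cid3`, `format_result_cid3`, `SCHEMA_OUTPUT_CID3`
# and `update_end`; only this import is clearly required by the code below.
from pyspark.sql import functions as F
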
fi_df = spark.sql('''
SELECT
    *
FROM %s
''' % (spa_utils.rename('app.app_pa_festival_information', params)))

fi_pd = fi_df.toPandas()
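# Group rows by third-level category (cid3) and compute each category's
# baseline, passing the festival information (as a pandas DataFrame) to every group.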
result = df.rdd \
    .map(lambda row: (row['item_third_cate_cd'], row)).groupByKey() \
    .flatMap(lambda row: calculate_baseline_cid3(row, 'self', fi_pd))

# Save the result as a Spark DataFrame
result_df = spark.createDataFrame(result.map(format_result_cid3),
                                  schema=SCHEMA_OUTPUT_CID3)
result_df = result_df.na.drop()
result_df = result_df\
    .withColumn('dt', F.lit(update_end))\
    .select('date', 'item_third_cate_cd', 'final_baseline', 'dt')

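# Enable Hive dynamic partitioning so the result can be written into its dt partition.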
spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
logger.info('Saving results...')
logger.info('inserting app.app_pa_baseline_cid3...')
spa_utils.save_result(result_df,
                      'app.app_pa_baseline_cid3',
                      partitioning_columns=['dt'],
                      write_mode=params['write_mode'],
                      spark=spark,
                      params=params)
logger.info('insert table done')
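
The snippet assumes `SCHEMA_OUTPUT_CID3` is defined elsewhere in the project; a minimal sketch consistent with the columns selected above ('dt' is added separately via withColumn) could look like the following, where the field types are assumptions:

from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Hypothetical schema sketch -- column names follow the select above, types are guesses.
SCHEMA_OUTPUT_CID3 = StructType([
    StructField('date', StringType(), True),
    StructField('item_third_cate_cd', StringType(), True),
    StructField('final_baseline', DoubleType(), True),
])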
Example #2
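# Assumed setup, not part of the original snippet: the surrounding job is
# expected to provide `spark`, `spa_utils`, `params`, `logger`, `update_start`,
# `update_end`, and the upstream DataFrames (`df_time`, `df_sales_dtsku`,
# `df_stock`) used below; only this import is clearly required by the code that follows.
from pyspark.sql import functions as F
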
''' % (spa_utils.rename('app.app_pa_sales_duration', params)))
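# Clamp each SKU's start_date so its date span never begins before update_start.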
df_sales_dtsku_start_end_date = df_sales_dtsku_start_end_date \
    .withColumn('start_date', F.when(F.col('start_date') >= update_start, F.col('start_date')).otherwise(update_start))
df_sales_dtsku_start_end_date.cache()

# Fill in the date features over each SKU's date span
df_sku_duration_time = df_sales_dtsku_start_end_date\
    .join(df_time,
          df_time['dt'] >= df_sales_dtsku_start_end_date['start_date'],
          'left')\
    .drop('start_date')\
    .filter(F.col('dt').isNotNull())
df_sku_duration_time.cache()

# Merge in sales and stock data
df_complete = df_sku_duration_time\
    .join(df_sales_dtsku,
          ['dt', 'item_sku_id'], 'left')\
    .join(df_stock,
          ['dt', 'item_sku_id'], 'left')\
    .fillna(0)
df_complete = df_complete.select([
    'item_sku_id',
    'newyear', 'springfestival', 'tombsweepingfestival', 'labourday',
    'dragonboatfestival', 'midautumnfestival', 'nationalday',
    'h1111mark', 'h618mark', 'h1212mark',
    'week_of_year', 'day_of_year', 'day_of_week',
    'free_gift_flag', 'ghost_offer_flag', 'dq_and_jq_pay_flag', 'jq_pay_flag',
    'dq_pay_flag', 'full_minus_offer_flag', 'suit_offer_flag', 'sku_offer_flag',
    'non_promo_flag',
    'sale_qtty', 'after_prefr_amount', 'before_prefr_amount',
    'synthetic_before_prefr_amount',
    'participation_rate_full_minus_and_suit_offer', 'participation_rate_dq_and_jq_pay',
    'sku_offer_discount_rate', 'full_minus_offer_discount_rate',
    'suit_offer_discount_rate', 'ghost_offer_discount_rate',
    'dq_and_jq_pay_discount_rate', 'jq_pay_discount_rate', 'dq_pay_discount_rate',
    'free_gift_discount_rate',
    'out_of_stock_flag',
    'dt'])

spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
logger.info('inserting app.app_pa_features_dtsku...')
spa_utils.save_result(df_complete,
                      'app.app_pa_features_dtsku',
                      partitioning_columns=['dt'],
                      write_mode=params['write_mode'],
                      spark=spark,
                      params=params)
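
# The read below assumes a `traffic_schema` defined elsewhere; a plausible
# sketch covering only the columns this snippet actually uses (field types are
# assumptions, and the real table may have more columns):
from pyspark.sql.types import StructType, StructField, StringType, LongType

traffic_schema = StructType([
    StructField('item_sku_id', StringType(), True),
    StructField('pv', LongType(), True),    # page views
    StructField('uv', LongType(), True),    # unique visitors
    StructField('dt', StringType(), True),  # partition date
])
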
# Read the traffic table
traffic_df = spa_utils.read_table('app_cmo_ol_client_sku_3_to_bjmart_di',
                                  start=update_start,
                                  end=update_end,
                                  spark=spark,
                                  params=params,
                                  sep='\t',
                                  header=True,
                                  schema=traffic_schema)

# app.app_pa_traffic_dtsku
# SKU traffic model
# Granularity: (dt, sku)
df_sku_traffic = traffic_df\
    .groupBy(['item_sku_id', 'dt'])\
    .agg(F.sum('pv').alias('pv'),
         F.sum('uv').alias('uv'))
df_sku_traffic = df_sku_traffic.select(['item_sku_id', 'pv', 'uv', 'dt'])

spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
logger.info('inserting app.app_pa_traffic_dtsku...')
spa_utils.save_result(df_sku_traffic,
                      'app.app_pa_traffic_dtsku',
                      partitioning_columns=['dt'],
                      write_mode=params['write_mode'],
                      spark=spark,
                      params=params)