Example #1
def get_mon_order_except():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id", "cust_seg",
                                                  "com_id")

        # ----------------------- fetch co_co_01
        co_co_01 = get_co_co_01(spark,scope=[1,1],filter="month") \
            .select("qty_sum", "amt_sum", "cust_id")
        # total order quantity and total order amount per retailer
        qty_amt_sum = co_co_01.groupBy("cust_id")\
            .agg(f.sum("qty_sum").alias("order_sum"), f.sum("amt_sum").alias("amt_sum")) \
            .join(co_cust, "cust_id")

        # ----- abnormal monthly total order quantity per retail store
        try:
            order_sum = qty_amt_sum.select("com_id", "cust_seg", "cust_id",
                                           "order_sum")

            cols = {
                "value": "order_sum",
                "abnormal": "sum_abno_month",
                "mean_plus_3std": "grade_sum_plus3",
                "mean_minus_3std": "grade_sum_minu3",
                "mean": "grade_sum"
            }
            values = list(cols.values())
            values.remove(cols["value"])

            print(f"{str(dt.now())}  零售店单月订购总量异常")
            result = is_except(order_sum, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()

        # ----- abnormal monthly average carton price per retail store
        try:
            avg_price=qty_amt_sum.withColumn("avg_price", col("amt_sum")/col("order_sum"))\
                     .select("com_id","cust_seg","cust_id","avg_price")

            cols = {
                "value": "avg_price",
                "abnormal": "price_abno_month",
                "mean_plus_3std": "grade_price_plus3",
                "mean_minus_3std": "grade_price_minu3",
                "mean": "grade_price"
            }
            values = list(cols.values())
            values.remove(cols["value"])

            print(f"{str(dt.now())}  零售店单月订货条均价异常")
            result = is_except(avg_price, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()

    except Exception:
        tb.print_exc()
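All of the outlier checks on this page go through a shared helper, is_except(df, cols, group), which is not shown here. Judging from the cols mapping (value, abnormal, mean, mean_plus_3std, mean_minus_3std), it flags values outside mean +/- 3 standard deviations within each group. A minimal sketch of what it could look like; this is an assumption, not the project's actual implementation:

from pyspark.sql import functions as f
from pyspark.sql.functions import col

def is_except(df, cols, group):
    # Per-group mean and population stddev of the value column.
    stats = df.groupBy(group).agg(
        f.mean(cols["value"]).alias(cols["mean"]),
        f.stddev_pop(cols["value"]).alias("_std"))
    # Mark rows outside mean +/- 3*std as abnormal (1), everything else 0.
    return df.join(stats, group) \
        .withColumn(cols["mean_plus_3std"], col(cols["mean"]) + 3 * col("_std")) \
        .withColumn(cols["mean_minus_3std"], col(cols["mean"]) - 3 * col("_std")) \
        .withColumn(cols["abnormal"],
                    f.when((col(cols["value"]) > col(cols["mean_plus_3std"])) |
                           (col(cols["value"]) < col(cols["mean_minus_3std"])), 1)
                     .otherwise(0)) \
        .drop("_std")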
Example #2
def get_last_order_except():
    try:

        co_co_01 = get_co_co_01(spark,[0,30])\
                           .select("qty_sum", "amt_sum", "cust_id", "born_date")

        co_cust = get_valid_co_cust(spark).select("cust_id", "cust_seg",
                                                  "com_id")

        # retailer's last order quantity: take each retailer's order with the latest date
        qty_amt_sum = co_co_01.groupBy("cust_id").agg(f.max("born_date").alias("born_date")) \
            .join(co_co_01, ["cust_id", "born_date"]) \
            .join(co_cust, "cust_id")

        # ----- abnormal single-order total quantity per retail store
        # 1. the last order may be a return, i.e. a negative order quantity
        try:
            order_sum = qty_amt_sum.select("com_id", "cust_seg", "cust_id",
                                           "qty_sum")

            cols = {
                "value": "qty_sum",
                "abnormal": "sum_abno_time",
                "mean_plus_3std": "last_sum_plus3",
                "mean_minus_3std": "last_sum_minu3",
                "mean": "last_sum_mean"
            }
            values = list(cols.values())

            print(f"{str(dt.now())}  零售店上次订购总量异常")
            result = is_except(order_sum, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()

        # ----- abnormal average carton price of the retail store's last order
        try:
            avg_price = qty_amt_sum.withColumn("avg_price", col("amt_sum")/col("qty_sum")) \
                .select("com_id", "cust_seg", "cust_id", "avg_price")

            cols = {
                "value": "avg_price",
                "abnormal": "avg_abno_time",
                "mean_plus_3std": "last_avg_plus3",
                "mean_minus_3std": "last_avg_minu3",
                "mean": "last_avg_mean"
            }
            values = list(cols.values())

            print(f"{str(dt.now())}  零售店上次订货条均价异常")
            result = is_except(avg_price, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()
    except Exception:
        tb.print_exc()
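Each example ends with result.foreachPartition(lambda x: write_hbase1(x, values, hbase)). write_hbase1 is defined elsewhere in the project; a hedged sketch of its likely shape, assuming a happybase client, a hypothetical host name, and the hbase dict convention ({"table": ..., "families": [...], "row": ...}) used throughout this page:

import happybase

def write_hbase1(partition, columns, hbase):
    # One HBase connection per Spark partition.
    conn = happybase.Connection("hbase-host")  # hypothetical host
    table = conn.table(hbase["table"])
    family = hbase["families"][0]
    with table.batch(batch_size=1000) as batch:
        for row in partition:
            # Row key comes from the column named by hbase["row"];
            # every written column lands under family:column.
            batch.put(str(row[hbase["row"]]),
                      {f"{family}:{c}": str(row[c]) for c in columns})
    conn.close()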
Example #3
def generate_brand_data():
    try:

        hbase = {"table": brand_table, "families": ["0"], "row": "brand_id"}
        print(f"{dt.now()} 生成品牌名称 {brand_table}")

        # write to the brand table
        plm_item = get_plm_item(spark).select("item_id", "item_name",
                                              "item_kind", "is_mrb")
        # item_kind != 4 (4 = confiscated); is_mrb = 1 (1 = in use); strip spaces, drop '*', replace Chinese parentheses with ASCII ones
        # clean up item_name
        # ??? could use all the data, then drop the names without parentheses
        plm_item_brand = plm_item\
                    .where(col("item_kind") != "4") \
                    .where(col("is_mrb") == "1") \
                    .withColumn("brand_name",item_name_udf(col("item_name")))\
                    .dropDuplicates(["brand_name"]) \
                    .where(col("brand_name") != "")

        from pyspark.sql import Window
        win = Window.orderBy("item_id")
        brand=plm_item_brand.select("item_id", "item_name", "brand_name") \
                       .withColumn("brand_id", f.row_number().over(win))
        brand.foreachPartition(
            lambda x: write_hbase1(x, ["brand_name"], hbase))
    except Exception:
        tb.print_exc()
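item_name_udf is also defined elsewhere. From the comments it normalizes an item name (strip spaces, drop '*', swap full-width parentheses for ASCII ones) and extracts a brand name, returning an empty string when none can be recovered. A guess at its shape; the brand-before-parenthesis rule is an explicit assumption:

import re
from pyspark.sql import functions as f
from pyspark.sql.types import StringType

@f.udf(returnType=StringType())
def item_name_udf(item_name):
    if item_name is None:
        return ""
    # Strip spaces and '*', replace full-width parentheses with ASCII ones.
    name = item_name.replace(" ", "").replace("*", "") \
                    .replace("(", "(").replace(")", ")")
    # Assumption: the brand is the text before the first '(',
    # e.g. "白沙(精品)" -> "白沙"; names without parentheses yield "".
    m = re.match(r"([^(]+)\(", name)
    return m.group(1) if m else ""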
Example #4
def get_warning():
    try:
        print("删除过期预警数据")
        levels=["YJFL004","YJFL012","YJFL003","YJFL001"]
        for level in levels:
             print(f"预警级别:{level}")
             delete_all(hbase["table"],row_prefix=level)



        print(f"{str(dt.now())} 预警")
        result = get_warning_result(white_list, city='岳阳市', com_id='011114306', day='20190601',
                                    cluster_dir=cluster_path+"/")

        # the values inside highprice_30days_order must be floats
        result["highprice_30days_order"] = result["highprice_30days_order"].apply(
            lambda x: json.dumps(x, ensure_ascii=False))


        df = spark.createDataFrame(result)\
                  .withColumn("classify_id",f.concat_ws("_",col("classify_level1_code"),col("cust_id")))

        cols=df.columns
        cols.remove("classify_id")
        df.foreachPartition(lambda x:write_hbase1(x,cols,hbase))

    except Exception:
        tb.print_exc()
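delete_all(table, row_prefix=...) clears the previous run's rows before new warnings are written. It lives elsewhere in the project; a sketch of a row-prefix delete with happybase, under the same hypothetical host assumption as above:

import happybase

def delete_all(table_name, row_prefix=None):
    # Scan every row key with the given prefix and delete them in batches.
    conn = happybase.Connection("hbase-host")  # hypothetical host
    table = conn.table(table_name)
    prefix = row_prefix.encode() if row_prefix is not None else None
    with table.batch(batch_size=1000) as batch:
        for key, _ in table.scan(row_prefix=prefix):
            batch.delete(key)
    conn.close()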
Example #5
def get_city():
    print(f"{str(dt.now())} city county")
    try:
        get_area(spark).select("city","county","sale_center_id")\
                  .groupBy("city","sale_center_id").agg(f.collect_list("county").alias("county"))\
                  .foreachPartition(lambda x:write_hbase1(x,["city","county","sale_center_id"],hbase))
    except Exception:
        tb.print_exc()
Example #6
def get_area_info():
    print(f"{str(dt.now())}   gdp ...")
    try:
        area_code = get_area(spark).select("county", "sale_center_id")

        population = get_city_ppl(spark).select("区县", "城镇人口", "农村人口")

        cols = [
            "county", "gdp", "gdp_add", "consumption", "consumption_add",
            "farmer", "farmer_add", "urban", "urban_add", "primary_industry",
            "primary_industry_add", "second_industry", "second_industry_add",
            "tertiary_industry", "tertiary_industry_add"
        ]

        city_info = get_city_info(spark).where(col("mtime") == f"{dt.now().year-1}-12-01") \
            .select(cols) \
            .join(population, col("county") == col("区县")) \
            .join(area_code, "county")

        # total urban and rural population per sales center
        total_ppl = city_info.groupBy("sale_center_id") \
            .agg(f.sum("城镇人口").alias("urban_total_ppl"), f.sum("农村人口").alias("farmer_total_ppl"))

        # each county's urban population as a share of the center's total urban population
        # each county's rural population as a share of the center's total rural population
        weight = city_info.join(total_ppl, "sale_center_id") \
            .withColumn("urban_weight", col("城镇人口") / col("urban_total_ppl")) \
            .withColumn("farmer_weight", col("农村人口") / col("farmer_total_ppl"))

        # weighted averages for urban, urban_add, farmer, farmer_add
        result = weight.groupBy("sale_center_id") \
                        .agg(
                        f.sum("gdp").alias("gdp"),
                        f.avg("gdp_add").alias("gdp_add"),

                        f.sum("consumption").alias("consumption"),
                        f.avg("consumption_add").alias("consumption_add"),

                        f.sum(col("urban") * col("urban_weight")).alias("urban"),
                        f.sum(col("urban_add") * col("urban_weight")).alias("urban_add"),

                        f.sum(col("farmer") * col("farmer_weight")).alias("farmer"),
                        f.sum(col("farmer_add") * col("farmer_weight")).alias("farmer_add"),

                        f.sum("primary_industry").alias("primary_industry"),
                        f.avg("primary_industry_add").alias("primary_industry_add"),

                        f.sum("second_industry").alias("second_industry"),
                        f.avg("second_industry_add").alias("second_industry_add"),

                        f.sum("tertiary_industry").alias("tertiary_industry"),
                        f.avg("tertiary_industry_add").alias("tertiary_industry_add")
                      )
        cols.remove("county")
        result.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()
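The aggregation above mixes three rules: absolute quantities (gdp, consumption, the industry columns) are summed, growth rates are plain-averaged, and the per-capita urban/farmer figures are averaged weighted by each county's population share. A tiny self-contained check of the weighted part, with made-up numbers:

# Two counties in one sales center (hypothetical figures).
counties = [
    {"urban": 30000.0, "urban_ppl": 200000},  # smaller county
    {"urban": 40000.0, "urban_ppl": 600000},  # larger county
]
total_ppl = sum(c["urban_ppl"] for c in counties)
# f.sum(col("urban") * col("urban_weight")) per sale_center_id computes exactly this:
weighted_urban = sum(c["urban"] * c["urban_ppl"] / total_ppl for c in counties)
print(weighted_urban)  # 37500.0 -- pulled toward the more populous county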
Example #7
def county_reality_supply():
    # last week's supply volume of each item spec per county and grade
    try:
        print(f"{str(dt.now())} 各区县各档位该品规上周投放量")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id",
                                            "cust_seg")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its list of counties
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # actual supply volume: the commented-out version takes exactly last week; the active one widens to the last four weeks
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")

        # column name for the computed value
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
                                .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
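The commented-out query documents the original intent (rows whose allocation window is exactly last week, Monday through Sunday), while the active one widens the window to the last four weeks. The date arithmetic is the same in both; isolated as a sketch:

from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.getOrCreate()
# date_trunc("week", ...) snaps to this week's Monday; minus 7 days gives last
# Monday, and last Monday plus 6 days gives last Sunday.
spark.range(1) \
    .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
    .withColumn("last_sun", f.date_add(f.col("last_mon"), 6)) \
    .show(truncate=False)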
Example #8
def source_supply():
    # compute each county/grade's score for every cigarette

    try:
        print(f"{str(dt.now())} 每个区每个档位对每款烟的评分")
        co_cust=get_valid_co_cust(spark).select("com_id","cust_id","sale_center_id","cust_seg")\
                                        .where(col("cust_seg")!="ZZ")

        co_co_line=get_co_co_line(spark,scope=[0,30])\
                     .where((col("qty_ord")>0) & (col("qty_rsn")>0))\
                     .select("cust_id","item_id","qty_ord")
        # order quantity of each cigarette per retailer
        qty_ord = co_co_line.groupBy("cust_id", "item_id").agg(
            f.sum("qty_ord").alias("amount"))

        # each retailer's rating of each cigarette
        item_rating = get_cigar_rating(spark)

        # number of retailers per county and grade
        cust_num = co_cust.groupBy("com_id","sale_center_id", "cust_seg") \
                          .agg(f.count("cust_id").alias("county_gears_volume_num"))
        # result:
        # county, grade, item spec, score, retailer count per county/grade
        result=qty_ord.join(item_rating,["cust_id","item_id"])\
               .withColumn("rating_amount",col("rating")*col("amount"))\
               .join(co_cust,"cust_id")\
               .groupBy("com_id","sale_center_id","cust_seg","item_id")\
               .agg(f.sum(col("rating_amount")).alias("county_gears_score"))\
               .join(cust_num,["com_id","sale_center_id","cust_seg"])

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its list of counties
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "county_gears_score", "county_gears_volume_num",
            "gears_data_marker"
        ]
        result.join(plm_item,"item_id")\
               .withColumn("row",f.concat_ws("_",col("sale_center_id"),col("cust_seg"),col("item_id")))\
               .join(city,"com_id")\
               .join(county,"sale_center_id")\
               .withColumn("gears_data_marker",f.lit("4"))\
               .withColumnRenamed("item_id","gauge_id")\
               .withColumnRenamed("item_name","gauge_name")\
               .withColumnRenamed("cust_seg","gears")\
               .foreachPartition(lambda x:write_hbase1(x,columns,hbase))
    except Exception:
        tb.print_exc()
Example #9
def get_item_stats_info_daily():
    # current-month order quantity per cigarette for this city/county
    # current-month order amount per cigarette for this city/county

    # cigarette id and name
    plm_item = get_plm_item(spark).select("item_id", "item_name")

    area = get_area(spark)
    # mapping from com_id to city
    city = area.dropDuplicates(["com_id"]).select("com_id", "city")
    # mapping from sale_center_id to its list of counties
    county = area.groupBy("sale_center_id")\
                        .agg(f.collect_list("county").alias("county"))\
                        .select("sale_center_id","county")
    # values for the marker column
    markers = ["1", "3"]
    # aggregate by city or by sales center
    groups = ["com_id", "sale_center_id"]
    joins = [city, county]
    # all output columns except the computed values
    cols_comm = [["city", "gauge_id", "gauge_name", "ciga_data_marker"],
                 [
                     "county", "sale_center_id", "gauge_id", "gauge_name",
                     "ciga_data_marker"
                 ]]
    # column names for the computed values
    cols = [["gauge_city_orders", "gauge_city_order_amount"],
            ["gauge_county_orders", "gauge_county_order_amount"]]

    co_co_line = get_co_co_line(spark, scope=[0, 0], filter="month")\
                                 .select("item_id","qty_ord","amt","com_id","sale_center_id")
    co_co_line.cache()
    for i in range(len(groups)):
        group = groups[i]
        join = joins[i]
        c = cols[i]
        marker = markers[i]
        try:

            # 2. current-month order quantity and amount of each cigarette per area
            print(f"{str(dt.now())}  current-month per-spec order quantity/amount by {group}")
            #com_id item_id qty_ord amt
            qty_amt=co_co_line.groupBy([group,"item_id"])\
                          .agg(f.sum(col("qty_ord")).alias(c[0]),f.sum(col("amt")).alias(c[1]))

            columns = cols_comm[i] + c
            qty_amt.withColumn("row",f.concat_ws("_",col(group),col("item_id")))\
                    .join(plm_item,"item_id")\
                    .join(join,group)\
                    .withColumnRenamed("item_id","gauge_id")\
                    .withColumnRenamed("item_name","gauge_name")\
                    .withColumn("ciga_data_marker",f.lit(marker))\
                    .foreachPartition(lambda x:write_hbase1(x,columns,hbase))
        except Exception:
            tb.print_exc()
    co_co_line.unpersist()
Example #10
def get_ppl_areaCpt_index():
    population = get_city_ppl(spark).select("区县", "总人口")
    area_code = get_area(spark).where(col("city").rlike("株洲市|邵阳市|岳阳市")) \
        .select("city","county", "sale_center_id")
    area_plt = population.join(area_code, col("county") == col("区县"))

    # population of each sales center
    sale_center_ppl = area_plt.groupBy("city", "sale_center_id") \
        .agg(f.sum("总人口").alias("ppl"))

    # ----- population index
    print(f"{str(dt.now())}  population index")
    try:
        colName = "population_index"
        # population of the most populous sales center in each city
        max_df = sale_center_ppl.groupBy("city") \
            .agg(f.max("ppl").alias("max"))

        sale_center_ppl.join(max_df, "city") \
            .withColumn(colName, col("ppl")/col("max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
    except Exception:
        tb.print_exc()

    # ----- area competitiveness index
    try:
        # number of retailers per sales center
        cust_num =spark.sql("select * from DB2_DB2INST1_CO_CUST where dt=(select max(dt) from DB2_DB2INST1_CO_CUST)").select("sale_center_id") \
            .groupBy("sale_center_id").count()

        # population / number of retailers
        print(f"{str(dt.now())}  area competitiveness index (population / retailer count)")
        ppl_cust = sale_center_ppl.join(cust_num, "sale_center_id") \
            .withColumn("ppl_cust", col("ppl")/col("count"))

        max_df = ppl_cust.groupBy("city").agg(f.max("ppl_cust").alias("max"))
        colName = "area_competitive_index"
        ppl_cust.join(max_df, "city") \
            .withColumn(colName, col("ppl_cust")/col("max") * 5) \
            .select("sale_center_id", colName)\
            .foreachPartition(lambda x:write_hbase1(x,[colName],hbase))
    except Exception:
        tb.print_exc()
Example #11
File: retail.py Project: ZhiYinZhang/study
def get_acc():
    # acc is the debit account, not the bank account recorded in co_cust
    print(f"{str(dt.now())} debit accounts")
    try:
        co_cust = get_co_cust(spark).select("cust_id")
        get_co_debit_acc(spark).select("cust_id", "acc") \
            .join(co_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, ["acc"], hbase))
    except Exception:
        tb.print_exc()
Example #12
def get_grade_except():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id", "com_id")
        # grade-change log table
        crm_cust_log=get_crm_cust_log(spark).where((col("change_type")=="CO_CUST.CUST_SEG") & (col("day_diff")<=90))\
                        .select("cust_id","change_frm","change_to","audit_date")

        # grade-change difference
        grade_diff_df=crm_cust_log.groupBy("cust_id")\
                   .agg(grade_diff_udf(f.max("change_frm"), f.min("change_frm"), f.max("change_to"), f.min("change_to")).alias("grade_diff"))\
                   .join(co_cust,"cust_id")
        grade_diff_df.createOrReplaceTempView("grade_diff_df")

        colName = "grade_abno"
        print(f"{str(dt.now())}   档位变更差异常的")
        result=spark.sql("select com_id,percentile_approx(grade_diff,0.25) as percent_25,percentile_approx(grade_diff,0.75) as percent_75 from grade_diff_df group by com_id")\
                .join(grade_diff_df,"com_id")\
                .withColumn(colName,box_plots_filter_udf(col("grade_diff"),col("percent_25"),col("percent_75")))\
                .where(col(colName)>=0)
        result.foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        print(f"{str(dt.now())}  档位变更差")
        colName = "grade_abs_three_month"
        grade_diff_df.join(result,"cust_id")\
                     .withColumnRenamed("grade_diff",colName)\
                     .foreachPartition(lambda x:write_hbase1(x,[colName],hbase))

        print(f"{str(dt.now())}   档位变更次数")
        colName = "grade_change_times"
        crm_cust_log.join(result,"cust_id")\
                    .groupBy("cust_id").agg(f.count("cust_id").alias(colName))\
                    .foreachPartition(lambda x:write_hbase1(x,[colName],hbase))

        print(f"{str(dt.now())}   90天前档位")
        colName = "grade_last_three_month"
        crm_cust_log.groupBy("cust_id") \
            .agg(f.min("audit_date").alias("audit_date")) \
            .join(crm_cust_log.join(result,"cust_id"), ["cust_id", "audit_date"]) \
            .withColumnRenamed("change_frm", colName) \
            .foreachPartition(lambda x: write_hbase1(x, [colName],hbase))
    except Exception:
        tb.print_exc()
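box_plots_filter_udf is defined elsewhere; the percentile_approx(..., 0.25/0.75) inputs and the where(col(colName) >= 0) filter suggest a standard box-plot rule: values beyond Q3 + 1.5*IQR or below Q1 - 1.5*IQR are outliers, and values inside the fences get a negative code so they can be filtered out. A sketch under that assumption:

from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType

@f.udf(returnType=IntegerType())
def box_plots_filter_udf(value, percent_25, percent_75):
    # Standard box-plot fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    iqr = percent_75 - percent_25
    if value > percent_75 + 1.5 * iqr:
        return 1   # abnormally high
    if value < percent_25 - 1.5 * iqr:
        return 0   # abnormally low
    return -1      # inside the fences; dropped by where(col(colName) >= 0)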
Example #13
def get_around_class_except():
    try:
        print(f"{str(dt.now())}  零售店订购烟品规异常")
        co_cust = get_valid_co_cust(spark).select("cust_id")

        co_co_line = get_co_co_line(spark,scope=[1,1],filter="month")\
                                  .select("cust_id","item_id","qty_ord")

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        # 1. quantity of each kind of cigarette per retailer
        cust_item_sum = co_co_line.join(plm_item, "item_id") \
            .groupBy("cust_id", "item_name") \
            .agg(f.sum("qty_ord").alias("cust_item_sum"))

        # 2. each retailer's top-3 cigarettes by order quantity
        win = Window.partitionBy("cust_id").orderBy(
            col("cust_item_sum").desc())
        rank3 = cust_item_sum.withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id") \
            .agg(f.collect_list("item_name").alias("items"))

        win = Window.partitionBy("cust_id1").orderBy(
            col("one_km_item_sum").desc())
        # the retailers cust_id0 around each retailer cust_id1
        around_cust = get_around_cust(spark, 1).select("cust_id1", "cust_id0")
        """
          零售户cust_id1周边包含cust_id0这些零售户 
        1.第一个join,计算每个零售户cust_id1一公里内有哪些零售户cust_id0
        2.第二个join,一公里内的cust_id0这些零售户所定各类烟的数量   cust_id0 与 cust_item_sum的cust_id
        3.根据cust_id1 item_name 计算一公里内各类烟的数量
        """
        # 3. top-3 cigarettes ordered within 1 km of each retailer
        one_km_rank3 = around_cust.join(cust_item_sum, col("cust_id0") == col("cust_id")) \
            .select("cust_id1", "item_name", "cust_item_sum") \
            .groupBy("cust_id1", "item_name") \
            .agg(f.sum("cust_item_sum").alias("one_km_item_sum")) \
            .withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id1") \
            .agg(f.collect_list("item_name").alias("one_km_items"))

        colName = ["regulation_abno", "ciga_top3_last_month", "ciga_top3_km"]
        # 4. intersect the two top-3 lists; an empty intersection is abnormal
        rank3.join(one_km_rank3, col("cust_id") == col("cust_id1")) \
            .where(f.size(f.array_intersect(col("items"), col("one_km_items"))) == 0) \
            .withColumn(colName[0],f.lit(1))\
            .withColumnRenamed("items",colName[1])\
            .withColumnRenamed("one_km_items",colName[2])\
            .join(co_cust,"cust_id")\
            .foreachPartition(lambda x:write_hbase1(x,colName,hbase))
    except Exception:
        tb.print_exc()
Example #14
File: retail.py Project: ZhiYinZhang/study
def get_vfr_index():
    try:
        print(f"{str(dt.now())} 零售户周边人流指数")
        # 有人流数据的零售户
        vfr = get_around_vfr(spark)

        vfr.cache()

        # retailers
        co_cust = get_co_cust(spark).select("cust_id")
        # retailers with coordinates
        cust_lng_lat = get_cust_lng_lat(spark) \
            .select("city", "cust_id", "lng", "lat") \
            .join(co_cust, "cust_id")

        cust_lng_lat.cache()

        # retailers with no foot traffic nearby
        not_df = cust_lng_lat.select("cust_id") \
            .exceptAll(vfr.select("cust_id")) \
            .join(cust_lng_lat, "cust_id")

        exist_df = vfr.join(cust_lng_lat, ["city", "cust_id"])

        if not_df.count()>0:
            #knn填充
            fill_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "avg_vfr")

            all_df = spark.createDataFrame(fill_df) \
                .unionByName(exist_df)
        else:
            all_df=exist_df

        # threshold
        threshold = all_df.groupBy("city") \
            .agg((f.mean("avg_vfr") + 3 * f.stddev_pop("avg_vfr")).alias("threshold"))

        truncate_df = all_df.join(threshold, "city") \
            .withColumn("avg_vfr", f.when(col("avg_vfr") > col("threshold"), col("threshold"))
                        .otherwise(col("avg_vfr"))
                        )

        log_df = truncate_df.withColumn("log", f.log(col("avg_vfr") + 1))
        log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

        colName = "people_count"
        log_df.join(log_max, "city") \
            .withColumn(colName, col("log") / col("log_max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        vfr.unpersist()
        cust_lng_lat.unpersist()
    except Exception:
        tb.print_exc()
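fillWithKNN comes from elsewhere in the project: stores with no foot-traffic record get avg_vfr predicted from nearby stores that have one. A hedged scikit-learn sketch; the signature matches the call above, while the k value and the use of raw (lng, lat) as features are assumptions:

import pandas as pd
from sklearn.neighbors import KNeighborsRegressor

def fillWithKNN(exist_df: pd.DataFrame, not_df: pd.DataFrame, target: str, k: int = 5):
    # Fit on stores that have the target value, using coordinates as features.
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(exist_df[["lng", "lat"]].astype(float), exist_df[target].astype(float))
    # Predict the target for stores that lack it; all other columns pass through.
    filled = not_df.copy()
    filled[target] = knn.predict(filled[["lng", "lat"]].astype(float))
    return filled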
Example #15
File: retail.py Project: ZhiYinZhang/study
def card_passId():
    try:
        co_cust=get_co_cust(spark).select("cust_id","identity_card_id")

        co_debit_acc=get_co_debit_acc(spark).select("cust_id","pass_id")

        colName = "license_not_match"
        result=co_cust.join(co_debit_acc, "cust_id") \
                   .withColumn(colName, isMatch(col("identity_card_id"), col("pass_id")))
        result.foreachPartition(lambda x: write_hbase1(x, ["license_not_match"],hbase))
    except Exception:
        tb.print_exc()
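isMatch is defined elsewhere; given the output column license_not_match, it presumably returns 1 when the ID number on the retail license and the pass_id on the debit account disagree. A sketch under that assumption:

from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType

@f.udf(returnType=IntegerType())
def isMatch(identity_card_id, pass_id):
    # 1 = mismatch (hence the column name license_not_match), 0 = match.
    if identity_card_id is None or pass_id is None:
        return 1
    return 0 if identity_card_id.strip() == pass_id.strip() else 1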
Example #16
def get_lng_lat():
    print(f"{str(dt.now())}  经纬度")
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id")
        crm_cust = get_crm_cust(spark).select("cust_id", "longitude",
                                              "latitude")

        co_cust.join(crm_cust,"cust_id")\
               .foreachPartition(lambda x:write_hbase1(x,["longitude","latitude"],hbase))
    except Exception:
        tb.print_exc()

    # ----- coordinates crawled from the web
    print(f"{str(dt.now())}   crawled coordinates")
    try:
        cust_lng_lat = get_cust_lng_lat(spark).select("cust_id", "lng", "lat")
        co_cust.join(cust_lng_lat,"cust_id")\
            .withColumnRenamed("lng","longitude")\
            .withColumnRenamed("lat","latitude")\
            .foreachPartition(lambda x:write_hbase1(x,["longitude","latitude"],hbase))
    except Exception:
        tb.print_exc()
Example #17
def get_city_county():
    print(f"{str(dt.now())} 市 区 abcode")
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id", "com_id",
                                                  "sale_center_id")

        area_code = get_area(spark)

        co_cust.join(area_code,["com_id","sale_center_id"])\
               .withColumnRenamed("城市","city")\
               .withColumnRenamed("区","county")\
               .foreachPartition(lambda x: write_hbase1(x, ["sale_center_id","city","county"],hbase))
    except Exception:
        tb.print_exc()
Example #18
File: retail.py Project: ZhiYinZhang/study
def get_cust_info():
    # retailer info table co_cust
    co_cust_cols=["cust_id","cust_name","cust_seg","status","pay_type","license_code","manager","identity_card_id",
                  "order_tel","inv_type","order_way","periods","busi_addr","work_port","base_type","sale_scope",
                  "scope","com_chara","is_tor_tax","is_sale_large","is_rail_cust","area_type","is_sefl_cust","is_func_cust",
                  "sale_center_id","sale_dept_id","slsman_id","slsman_mobile"]
    # column names to rename, matching the HBase column names
    co_cust_renamed = {"is_tor_tax": "tor_tax", "is_sale_large": "sale_large",
                       "is_rail_cust": "rail_cust", "is_sefl_cust": "sefl_cust",
                       "is_func_cust": "func_cust", "cust_seg": "grade"
                       }
    # retailer info table crm_cust
    crm_cust_cols=["cust_id","longitude","latitude","is_multiple_shop","org_model","is_night_shop",
                   "busi_time_type","consumer_group","consumer_attr","busi_type","compliance_grade",
                   "rim_type","info_terminal"]
    crm_cust_renamed = {"is_multiple_shop": "multiple_shop", "is_night_shop": "night_shop"}


    # delivery address
    ldm_cust = spark.sql(
        "select cust_id,dist_addr from DB2_DB2INST1_LDM_CUST where dt=(select max(dt) from DB2_DB2INST1_LDM_CUST)")
    # whether it belongs to a delivery point
    ldm_cust_dist = spark.sql(
        "select cust_id,is_in_point from DB2_DB2INST1_LDM_CUST_DIST where dt=(select max(dt) from DB2_DB2INST1_LDM_CUST_DIST)")



    print(f"{str(dt.now())} co_cust/crm_cust零售户信息")
    try:
        co_cust = spark.sql("select * from DB2_DB2INST1_CO_CUST where dt=(select max(dt) from DB2_DB2INST1_CO_CUST)") \
                                 .where(col("com_id").isin(cities)).select(co_cust_cols)
        # rename columns
        for key in co_cust_renamed.keys():
            co_cust = co_cust.withColumnRenamed(key, co_cust_renamed[key])


        crm_cust=get_crm_cust(spark).select(crm_cust_cols)
        # rename columns
        for key in crm_cust_renamed.keys():
            crm_cust = crm_cust.withColumnRenamed(key, crm_cust_renamed[key])


        cust_info=co_cust.join(crm_cust, "cust_id","left")\
                          .join(ldm_cust,"cust_id","left")\
                          .join(ldm_cust_dist,"cust_id","left")
        cols=cust_info.columns
        cust_info.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()
Example #19
File: retail.py Project: ZhiYinZhang/study
def get_city_county():
    # ----- city county
    print(f"{str(dt.now())} city county abcode")
    try:
        co_cust=get_co_cust(spark).select("cust_id","com_id","sale_center_id")

        area_code=get_area(spark).groupBy("com_id","city","sale_center_id")\
                         .agg(f.collect_list("county").alias("county"))

        co_cust.join(area_code,["com_id","sale_center_id"])\
               .withColumnRenamed("城市","city")\
               .withColumnRenamed("区","county")\
               .foreachPartition(lambda x: write_hbase1(x, ["city","county"],hbase))
    except Exception:
        tb.print_exc()
Example #20
def city_reality_supply():
    # last week's planned supply volume of each item spec citywide
    # last week's actual supply volume of each item spec citywide
    # last week's remaining supply volume of each item spec citywide
    try:
        print(f"{str(dt.now())} 全市上周各品规 计划/实际/剩余 量")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")

        # get last week (the active query below hardcodes the 2019-07-08 to 2019-07-14 week)
        # cust_item_spw = spark.sql(
        #     "select com_id,item_id,qty_plan,qty_remain,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .where(col("com_id").isin(cities)) \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))

        cust_item_spw = spark.sql(
            "select com_id,item_id,qty_plan,qty_remain,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .where(col("com_id").isin(cities)) \
            .where((col("begin_date") == "20190708") & (col("end_date") <= "20190714"))

        # column names for the computed values
        cols = [
            "gauge_week_planned_volume", "gauge_week_reality_volume",
            "gauge_week_residue_volume"
        ]
        result = cust_item_spw.groupBy("com_id", "item_id") \
            .agg(f.sum("qty_plan").alias(cols[0]),
                 f.sum("qty_allocco").alias(cols[1]),
                 f.sum("qty_remain").alias(cols[2]))

        columns = ["gauge_id", "gauge_name", "city", "gears_data_marker"
                   ] + cols
        result.withColumn("row", f.concat_ws("_", col("com_id"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("0")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .withColumnRenamed("item_id","gauge_id")\
             .withColumnRenamed("item_name","gauge_name")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #21
def get_gdp_index():
    try:
        print(f"{str(dt.now())} gdp index")
        area_code = get_area(spark).select("sale_center_id", "county")
        gdp_sum = get_city_info(spark).select("city", "county", "gdp", "mtime") \
            .where(col("mtime") == f"{dt.now().year-1}-12-01") \
            .join(area_code, "county") \
            .groupBy("city", "sale_center_id") \
            .agg(f.sum("gdp"))

        max_df = gdp_sum.groupBy("city").agg(f.max("sum(gdp)").alias("max"))
        colName = "GDP"
        gdp_sum.join(max_df, "city") \
            .withColumn(colName, col("sum(gdp)")/col("max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

    except Exception:
        tb.print_exc()
Example #22
def get_high_cons_level():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id")
        print(f"{str(dt.now())}  零售店高价烟比例/消费水平")
        # 消费水平
        consume_level_df = get_consume_level(spark).select(
            "cust_id", "consume_level")

        # ----------------------- fetch co_co_line
        co_co_line=get_co_co_line(spark,scope=[1,1],filter="month") \
            .select("cust_id","qty_ord","price","com_id")

        # total quantity of cigarettes ordered per retailer
        item_num = co_co_line.groupBy("com_id", "cust_id").agg(
            f.sum("qty_ord").alias("item_num"))

        # quantity of high-price cigarettes per retailer
        high_price_num = co_co_line.where(col("price") >= 500) \
            .groupBy("cust_id") \
            .agg(f.sum("qty_ord").alias("high_price_num"))
        # high-price cigarette ratio per retailer
        high_price_ratio = item_num.join(high_price_num, "cust_id") \
            .withColumn("high_price_ratio", col("high_price_num")/col("item_num")) \
            .select("com_id", "cust_id", "high_price_ratio")


        high_consume_level=high_price_ratio.join(consume_level_df,"cust_id")\
                        .withColumn("retail_high_cons",col("high_price_ratio")/col("consume_level"))

        cols = {
            "value": "retail_high_cons",
            "abnormal": "high_cons",
            "mean_plus_3std": "high_std_plus3",
            "mean_minus_3std": "high_std_minu3",
            "mean": "city_high_cons"
        }
        values = list(cols.values())

        is_except(high_consume_level,cols,["com_id"])\
                 .join(co_cust,"cust_id")\
                 .foreachPartition(lambda x:write_hbase1(x,values,hbase))
    except Exception:
        tb.print_exc()
Example #23
File: retail.py Project: ZhiYinZhang/study
def get_cust_index():
    types = {"catering_cons_count": "(.*餐厅.*)|(.*咖啡厅.*)|(.*茶艺馆.*)|(.*甜品.*)",
             "convenient_trans_count": "(.*地铁站.*)|(.*公交.*)|(.*机场.*)|(.*港口码头.*)",
             "shopping_cons_count": "(.*购物中心.*)|(.*普通商场.*)|(.*免税品店.*)|(.*市场.*)|(.*特色商业街.*)|(.*步行街.*)|(.*农贸市场.*)",
             "entertainment_count": "(.*运动场馆.*)|(.*娱乐场所.*)|(.*休闲场所.*)"}
    try:
        # coordinates of restaurants, transport hubs, malls, entertainment venues, etc.
        coordinate = get_coordinate(spark).select("cityname","lng", "lat", "types")

        coordinate.cache()

        # retailer coordinates
        co_cust = get_co_cust(spark).select("cust_id")
        cust_lng_lat = get_cust_lng_lat(spark).select("cust_id", "city", "lng", "lat") \
            .join(co_cust, "cust_id")

        # the 1 km longitude and latitude bounds around each retailer
        cust_lng_lat0 = cust_lng_lat.withColumn("scope", f.lit(1)) \
            .withColumn("lng_l", lng_l(col("lng"), col("lat"), col("scope"))) \
            .withColumn("lng_r", lng_r(col("lng"), col("lat"), col("scope"))) \
            .withColumn("lat_d", lat_d(col("lat"), col("scope"))) \
            .withColumn("lat_u", lat_u(col("lat"), col("scope")))

        cust_lng_lat0.cache()

        for y in range(len(types)):
            try:
                colName = list(types.keys())[y]
                regex = types[colName]

                print(f"{str(dt.now())} {colName}指数")

                result = poi_index(spark, cust_lng_lat0,coordinate,regex, is_fill=True) \
                    .withColumnRenamed("poi_index", colName)

                result.foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
            except Exception:
                tb.print_exc()

        coordinate.unpersist()
        cust_lng_lat0.unpersist()
    except Exception:
        tb.print_exc()
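lng_l, lng_r, lat_d and lat_u turn a center point plus a radius in kilometres into a longitude/latitude bounding box. One degree of latitude is roughly 111 km everywhere, while a degree of longitude shrinks with cos(latitude); a sketch under that approximation (the project's own UDFs may differ):

import math
from pyspark.sql import functions as f
from pyspark.sql.types import DoubleType

KM_PER_DEG = 111.0  # approximate km per degree of latitude

@f.udf(returnType=DoubleType())
def lng_l(lng, lat, scope):
    # Western bound: scope km to the left of the center.
    return float(lng) - scope / (KM_PER_DEG * math.cos(math.radians(float(lat))))

@f.udf(returnType=DoubleType())
def lng_r(lng, lat, scope):
    # Eastern bound.
    return float(lng) + scope / (KM_PER_DEG * math.cos(math.radians(float(lat))))

@f.udf(returnType=DoubleType())
def lat_d(lat, scope):
    # Southern bound.
    return float(lat) - scope / KM_PER_DEG

@f.udf(returnType=DoubleType())
def lat_u(lat, scope):
    # Northern bound.
    return float(lat) + scope / KM_PER_DEG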
Example #24
def recommend_supply_value():
    # recommended supply volume

    try:
        print(f"{str(dt.now())} 建议投放量")
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        co_co_line = get_co_co_line(spark, scope=[1, 365])\
                             .where(col("com_id")=="011114302")\
                             .select("co_num", "line_num", "cust_id", "item_id","sale_center_id",
                                     "qty_need", "qty_ord", "qty_rsn", "price", "amt","born_date")\
                             .withColumn("born_date",col("born_date").cast("string"))

        pd_df = co_co_line.toPandas()
        """
        item_id,2019-06-03 00:00:00
        1130309,51
        1130316,26
        """
        sub_df = predict(pd_df, 1)

        df = spark.createDataFrame(sub_df)

        value = df.columns[1]
        result = df.withColumnRenamed(value, "gauge_advise_volume") \
                     .withColumn("date", f.lit(value))

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        columns = [
            "com_id", "city", "gauge_id", "gauge_name", "gauge_advise_volume",
            "date", "gears_data_marker"
        ]
        result.join(plm_item,"item_id")\
            .withColumn("com_id",f.lit("011114302"))\
            .withColumn("city",f.lit("株洲市"))\
            .withColumn("row",f.concat_ws("_",col("com_id"),col("item_id")))\
            .withColumn("gears_data_marker",f.lit("0"))\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .foreachPartition(lambda x:write_hbase1(x,columns,hbase))

    except Exception:
        tb.print_exc()
Example #25
def get_item_rating():
    # regional preference distribution of each cigarette spec
    hbase = {"table": als_table + "_TEMP", "families": ["0"], "row": "row"}
    try:
        # cigarette id and name
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its list of counties
        county = area.groupBy("sale_center_id") \
                .agg(f.collect_list("county").alias("county")) \
                .select("sale_center_id", "county")

        #com_id,sale_center_id,city,county,cust_id,cust_name,longitude,latitude
        co_cust = get_co_cust(spark).select("cust_id", "cust_name", "com_id", "sale_center_id") \
                                    .join(get_cust_lng_lat(spark), "cust_id") \
                                    .withColumnRenamed("lng", "longitude") \
                                    .withColumnRenamed("lat", "latitude")\
                                    .join(county,"sale_center_id")\
                                    .join(city,"com_id")

        print(f"{str(dt.now())} 每个零售户对每品规烟的评分")
        columns = [
            "city", "sale_center_id", "county", "cust_id", "cust_name",
            "longitude", "latitude", "gauge_id", "gauge_name",
            "grade_data_marker", "gauge_grade"
        ]

        get_cigar_rating(spark).join(co_cust,"cust_id")\
                                  .join(plm_item,"item_id")\
                                  .withColumn("row",f.concat_ws("_",col("item_id"),col("cust_id")))\
                                  .withColumn("grade_data_marker",f.lit("1"))\
                                  .withColumnRenamed("item_id","gauge_id")\
                                  .withColumnRenamed("item_name","gauge_name")\
                                  .withColumnRenamed("rating","gauge_grade")\
                                  .foreachPartition(lambda x:write_hbase1(x,columns,hbase))

    except Exception:
        tb.print_exc()
Example #26
def supply_analysis_county():
    # order-fill rate of each cigarette spec per county
    plm_item = get_plm_item(spark).select("item_id", "item_name")

    area = get_area(spark)

    # mapping from sale_center_id to its list of counties
    county = area.groupBy("sale_center_id") \
        .agg(f.collect_list("county").alias("county")) \
        .select("sale_center_id", "county")

    co_co_line = get_co_co_line(spark, scope=[1, 1], filter="week") \
        .where(col("qty_rsn") > 0) \
        .select("sale_center_id", "item_id", "qty_ord", "qty_rsn","cust_id")

    # filter out grade ZZ only; do not filter by status
    # (com_id must be selected here, otherwise the where on com_id below cannot resolve)
    co_cust = spark.sql(
        "select cust_id,com_id from DB2_DB2INST1_CO_CUST where dt=(select max(dt) from DB2_DB2INST1_CO_CUST) and cust_seg!='ZZ'") \
        .where(col("com_id").isin(cities))

    try:
        print(f"{str(dt.now())} 各区县该卷烟品规订足率")
        colName = "county_gauge_reserve"
        reserve_ratio = co_co_line.join(co_cust,"cust_id")\
            .groupBy("sale_center_id", "item_id") \
            .agg(f.sum("qty_ord").alias("qty_ord"), f.sum("qty_rsn").alias("qty_rsn")) \
            .withColumn(colName, col("qty_ord") / col("qty_rsn"))

        columns = [
            "sale_center_id", "county", "gauge_id", "gauge_name",
            "gears_data_marker", colName
        ]
        reserve_ratio.join(county, "sale_center_id") \
            .join(plm_item, "item_id") \
            .withColumn("row", f.concat_ws("_", col("sale_center_id"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("1")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #27
def get_order_similar_cigar():
    # similar cigarettes ordered by customers
    try:
        print(f"{str(dt.now())} 客户订购相似卷烟")
        # origin为要查找的烟,nearest为最相似的卷烟
        similar_cigar = get_client_similar_cigar(spark)

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        columns = [
            "gauge_id", "gauge_name", "gauge_client_like_ciga",
            "ciga_data_marker"
        ]
        json_udf = f.udf(lambda x, y, z: json.dumps(
            {
                "gaugeId": x,
                "gaugeName": y,
                "gaugeSimilarity": z
            },
            ensure_ascii=False))

        df_map = similar_cigar.join(plm_item, col("origin") == col("item_id")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .join(plm_item, col("other") == col("item_id")) \
            .withColumn("item_map", json_udf(col("item_id"), col("item_name"), col("distance")))

        # sort by similarity, descending
        map_list_sort_udf = f.udf(lambda l, k: sorted(
            l, key=lambda x: json.loads(x)[k], reverse=True))

        df_map.groupBy("gauge_id", "gauge_name") \
            .agg(f.collect_list(col("item_map")).alias("item_list")) \
            .withColumn("gauge_client_like_ciga", map_list_sort_udf(col("item_list"), f.lit("gaugeSimilarity"))) \
            .withColumn("row", col("gauge_id")) \
            .withColumn("ciga_data_marker", f.lit("4")) \
            .select("row", "gauge_id", "gauge_name", "ciga_data_marker", "gauge_client_like_ciga") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #28
def get_area_ppl():
    print(f"{str(dt.now())}   人口数量")
    try:
        population = get_city_ppl(spark).select("区县", "总人口", "常住人口", "城镇人口",
                                                "农村人口")
        area_code = get_area(spark).where(col("city").rlike("株洲市|邵阳市|岳阳市")) \
                                  .select("city","county", "sale_center_id")
        area_plt = population.join(area_code, col("county") == col("区县"))

        cols = [
            "total_number", "permanent_number", "urban_number",
            "country_number"
        ]

        # population of each sales center
        sale_center_ppl = area_plt.groupBy("city", "sale_center_id") \
                                  .agg(f.sum("总人口").alias(cols[0]), f.sum("常住人口").alias(cols[1]),
                                         f.sum("城镇人口").alias(cols[2]), f.sum("农村人口").alias(cols[3]))

        sale_center_ppl.foreachPartition(
            lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()
Example #29
def get_avg_cons_level():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id")
        print(f"{str(dt.now())}  零售店订货条均价/消费水平")
        consume_level_df = get_consume_level(spark).select(
            "cust_id", "consume_level")

        # ----------------------- fetch co_co_01
        co_co_01=get_co_co_01(spark,scope=[1,1],filter="month") \
                .select("com_id", "qty_sum", "amt_sum", "cust_id")

        # total order quantity and total order amount per retailer
        qty_amt_sum = co_co_01.groupBy("com_id","cust_id")\
            .agg(f.sum("qty_sum").alias("order_sum"), f.sum("amt_sum").alias("amt_sum"))

        # average carton price per retailer
        avg_price=qty_amt_sum.withColumn("avg_price", col("amt_sum")/col("order_sum"))\
                   .select("com_id","cust_id","avg_price")

        avg_consume_level=avg_price.join(consume_level_df,"cust_id")\
                 .withColumn("retail_price_cons",col("avg_price")/col("consume_level"))

        cols = {
            "value": "retail_price_cons",
            "abnormal": "price_cons",
            "mean_plus_3std": "price_std_plus3",
            "mean_minus_3std": "price_std_minu3",
            "mean": "city_price_cons"
        }
        values = list(cols.values())


        is_except(avg_consume_level,cols,["com_id"])\
                        .join(co_cust,"cust_id")\
                        .foreachPartition(lambda x:write_hbase1(x,values,hbase))
    except Exception:
        tb.print_exc()
Example #30
def get_trans_busiArea_index():
    try:
        coordinate = get_coordinate(spark).select("lng", "lat", "types",
                                                  "adcode", "cityname")

        area_code=get_area(spark).withColumnRenamed("区域编码","adcode")\
                        .select("adcode","county","sale_center_id")

        types = {
            "trans_index":
            "(.*地铁站.*)|(.*公交.*)|(.*机场.*)|(.*港口码头.*)",
            "area_index":
            "(.*餐厅.*)|(.*咖啡厅.*)|(.*茶艺馆.*)|(.*甜品店.*)|(.*购物中心.*)|(.*运动场馆.*)|(.*娱乐场所.*)|(.*休闲场所.*)",
        }

        area_coor = coordinate.join(area_code, "adcode")

        for colName in types.keys():
            print(f"{str(dt.now())}   {colName}")
            regex = types[colName]
            try:
                # from area_coor, first take the POIs whose type matches the regex
                count_df=area_coor.where(col("types").rlike(regex)) \
                    .groupBy("cityname", "sale_center_id") \
                    .count()
                # the max count per city
                max_df = count_df.groupBy("cityname").agg(
                    f.max("count").alias("max"))

                count_df.join(max_df, "cityname")\
                    .withColumn(colName,f.round(col("count")/col("max")*5,4)) \
                    .foreachPartition(lambda x: write_hbase1(x, [colName],hbase))
            except Exception:
                tb.print_exc()
    except Exception:
        tb.print_exc()