Example #1
        gwid_hosid_dict[gw_id] = hos_id
    logger.debug('-->gwid_hosid:' + str(gwid_hosid_dict.__len__()))
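    # Parse the value part of each log record into 17 comma-separated fields,
    # strip whitespace, and append the hos_id resolved from gwid_hosid_dict
    # (empty string when the gw_id is unmapped).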
    users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
        .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(), \
                        p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
                        p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
                        p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
    logger.debug('-->users:' + str(users.count()))
    schema_string = "id gw_id supp_id user_id user_type " \
                    "user_name login_time logout_time mac ip " \
                    "user_agent download_flow upload_flow os browser " \
                    "ratio batch_no hos_id"

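    # Build an all-StringType schema from the space-separated field names and
    # register the parsed rows as the temp table "wxcity_userlogin_info".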
    fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
    schema = StructType(fields)
    schema_users = sql_context.applySchema(users, schema)
    schema_users.registerTempTable("wxcity_userlogin_info")

    # register UDFs
    sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
    sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
    sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
    sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
    sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())

    lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)

    # group by day,hosid,(mac),2, 5, 10, 30, 60
    #repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line:line.split('\t')).filter(lambda x:len(x)==8)
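    # Each pre-computed line is tab-separated: day, hos_id, mac followed by the
    # 2/5/10/30/60 repeat buckets listed in the comment above.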
    repeat_list = sc.parallelize(lines_list).map(lambda line:line.split('\t'))
    schema_string = "day hos_id mac t2 t5 " \
    # hosid_count.foreach(print_str)

    user_top_hosid = hosid_count.groupByKey().mapValues(list).sortByKey() \
        .map(topcount)
    # (u'00:66:4B:9B:0F:C9', u'')
    # user_top_hosid.foreach(print_str)

    # user,days,count
    days_count = parts.map(convert_day).groupByKey().mapValues(set).map(compute_mark)
    # sample row: (u'95:15:DF:EE:41:E9', u'\u5b558\u6708|\u5b55\u524d\u671f', u'2014-04-06')
    # i.e. (mac, pregnancy-stage mark in Chinese, date)

    #days_count.foreach(print_str)
    # join:mac,mark,hosid
    mac_mark_hosid = days_count.join(user_top_hosid).map(generate_ret)
    #mac_mark_hosid.take(10)
    #mac_mark_hosid.foreach(print_str)
    fields = [
        StructField('user', StringType(), True),
        StructField('stage', StringType(), True),
        StructField('conceive', StringType(), True),
        StructField('area', StringType(), True)
        ]
    schema = StructType(fields)
    dest = sqlContext.applySchema(mac_mark_hosid, schema)
    dest.registerTempTable("user_profile")
    # Coalesce to 10 partitions before writing so the output is not fragmented into many small Parquet files.
    dest.coalesce(10).write.parquet(output, mode='overwrite')


    sc.stop()
Example #3
    kpi_min, kpi_max, input_l3, input_l5, output_l5, start_date, end_date = args.kpi_min, args.kpi_max, args.input_l3, args.input_l5, args.output_l5, args.start_date, args.end_date

    if kpi_min and kpi_max and input_l3 and input_l5 and start_date and end_date:
        conf = SparkConf().setAppName("SparkSQL Evaluation Level5")
        sc = SparkContext(conf=conf)
        sqlContext = SQLContext(sc)
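        # Level 3 input: CSV rows of CONSUMER_ID, KPI_1, KPI_2 (quotes are stripped before parsing).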
        l3 = sc.textFile(input_l3).coalesce(4)
        d3 = l3.map(lambda z: z.replace('"', '')) \
            .map(lambda z: z.split(',')) \
            .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
        field3 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("KPI_1", FloatType(), True),
            StructField("KPI_2", FloatType(), True)
        ]
        s3 = StructType(field3)
        schema3 = sqlContext.applySchema(d3, s3)
        schema3.registerTempTable("Level3")

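        # Level 5 input: CSV rows of CONSUMER_ID, CAMPAIGN_NAME, EVENT_DATE
        # ("%Y-%m-%d %H:%M:%S" timestamps reduced to dates) and EVENT_TYPE_ID.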
        l5 = sc.textFile(input_l5).coalesce(24)
        d5 = l5.map(lambda z: z.replace('"', '')) \
            .map(lambda z: z.split(',')) \
            .map(lambda p: (p[0], p[1],
                            datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S").date(),
                            int(p[3])))
        field5 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("CAMPAIGN_NAME", StringType(), True),
            StructField("EVENT_DATE", DateType(), True),
            StructField("EVENT_TYPE_ID", IntegerType(), True)
        ]
        s5 = StructType(field5)
        schema5 = sqlContext.applySchema(d5, s5)
    args = parser.parse_args()

    kpi_min, kpi_max, input_l3, output_l3, start_date, end_date = args.kpi_min, args.kpi_max, args.input_l3, args.output_l3, args.start_date, args.end_date

    if kpi_min and kpi_max and input_l3 and output_l3 and start_date and end_date:
        conf = SparkConf().setAppName("SparkSQL Evaluation Level3")
        sc = SparkContext(conf=conf)
        sqlContext = SQLContext(sc)
        l3 = sc.textFile(input_l3).coalesce(4)
        d3 = l3.map(lambda z: z.replace('"', '')) \
            .map(lambda z: z.split(',')) \
            .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
        field3 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("KPI_1", FloatType(), True),
            StructField("KPI_2", FloatType(), True)
        ]
        s3 = StructType(field3)
        schema3 = sqlContext.applySchema(d3, s3)
        schema3.registerTempTable("Level3")
        sqlContext.cacheTable("Level3")

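        # Keep only KPI_1 values inside the [kpi_min, kpi_max] slab and report
        # count/sum/min/max/avg of KPI_1 for that slab.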
        query3 = sqlContext.sql(
            ("SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), MAX(KPI_1), AVG(KPI_1) "
             "FROM (SELECT KPI_1, "
             "CASE WHEN KPI_1 >= {kmin} AND KPI_1 <= {kmax} THEN 1 ELSE 0 END SEGMENT_ID, "
             "CASE WHEN KPI_1 >= {kmin} AND KPI_1 <= {kmax} THEN '{kmin} - {kmax}' "
             "ELSE NULL END SLAB FROM Level3) DUMP "
             "WHERE SLAB IS NOT NULL "
             "GROUP BY SLAB, SEGMENT_ID "
             "ORDER BY SEGMENT_ID").format(kmin=kpi_min, kmax=kpi_max))

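        # Collapse to a single partition so the slab report lands in one output file.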
        query3.coalesce(1).saveAsTextFile(output_l3)