gwid_hosid_dict[gw_id] = hos_id
logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))

# parse user login records (17 fields) and append the hos_id looked up by gw_id
users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
    .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(),
                    p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(),
                    p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(),
                    p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
logger.debug('-->users:' + str(users.count()))

schema_string = "id gw_id supp_id user_id user_type " \
                "user_name login_time logout_time mac ip " \
                "user_agent download_flow upload_flow os browser " \
                "ratio batch_no hos_id"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")

# register UDFs
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
# datetime.hour is a property, not a method (assuming DateUtil.str_to_date returns a datetime)
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour, IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())

lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)

# grouped by day, hos_id, (mac) with repeat windows 2, 5, 10, 30, 60
# repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin) \
#     .map(lambda line: line.split('\t')).filter(lambda x: len(x) == 8)
repeat_list = sc.parallelize(lines_list).map(lambda line: line.split('\t'))
# trailing field names inferred from the len == 8 filter and the window comment above
schema_string = "day hos_id mac t2 t5 " \
                "t10 t30 t60"
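# Illustrative only (an assumption, not in the original script): the UDFs
# registered above can be used directly in SQL against wxcity_userlogin_info,
# whose column names come from the first schema_string, e.g.:
#   sample = sql_context.sql(
#       "SELECT user_id, hos_id, get_hour(login_time) AS login_hour "
#       "FROM wxcity_userlogin_info WHERE to_int(download_flow) > 0")
#   logger.debug('-->sample:' + str(sample.count()))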
# hosid_count.foreach(print_str)
user_top_hosid = hosid_count.groupByKey().mapValues(list).sortByKey() \
    .map(topcount)
# e.g. (u'00:66:4B:9B:0F:C9', u'')
# user_top_hosid.foreach(print_str)

# user, days, count
days_count = parts.map(convert_day).groupByKey().mapValues(set).map(compute_mark)
# e.g. (u'95:15:DF:EE:41:E9', u'\u5b558\u6708|\u5b55\u524d\u671f', u'2014-04-06')
# (the middle field reads "month 8 of pregnancy | early pregnancy")
# days_count.foreach(print_str)

# join: mac, mark, hosid
mac_mark_hosid = days_count.join(user_top_hosid).map(generate_ret)
# mac_mark_hosid.take(10)
# mac_mark_hosid.foreach(print_str)

fields = [
    StructField('user', StringType(), True),
    StructField('stage', StringType(), True),
    StructField('conceive', StringType(), True),
    StructField('area', StringType(), True)
]
schema = StructType(fields)
dest = sqlContext.applySchema(mac_mark_hosid, schema)
dest.registerTempTable("user_profile")

# combine partitions before writing out
dest.coalesce(10).write.parquet(output, 'overwrite')
sc.stop()
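# Illustrative read-back (an assumption, not part of the original job): the
# parquet written above can be sanity-checked from a fresh session, e.g.:
#   sc = SparkContext(conf=SparkConf().setAppName("user_profile check"))
#   sqlContext = SQLContext(sc)
#   sqlContext.read.parquet(output).show(10)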
kpi_min, kpi_max, input_l3, input_l5, output_l5, start_date, end_date = \
    args.kpi_min, args.kpi_max, args.input_l3, args.input_l5, \
    args.output_l5, args.start_date, args.end_date

# validate all required arguments (input_l3 and output_l5 were missing from the original check
# even though both are used by this job)
if kpi_min and kpi_max and input_l3 and input_l5 and output_l5 and start_date and end_date:
    conf = SparkConf().setAppName("SparkSQL Evaluation Level5")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    l3 = sc.textFile(input_l3).coalesce(4)
    d3 = l3.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
    field3 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("KPI_1", FloatType(), True),
        StructField("KPI_2", FloatType(), True)
    ]
    s3 = StructType(field3)
    schema3 = sqlContext.applySchema(d3, s3)
    schema3.registerTempTable("Level3")

    l5 = sc.textFile(input_l5).coalesce(24)
    d5 = l5.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (p[0], p[1],
                           datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S").date(),
                           int(p[3])))
    field5 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("CAMPAIGN_NAME", StringType(), True),
        StructField("EVENT_DATE", DateType(), True),
        StructField("EVENT_TYPE_ID", IntegerType(), True)
    ]
    s5 = StructType(field5)
    schema5 = sqlContext.applySchema(d5, s5)
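    # The fragment is truncated here; a plausible continuation (an assumption,
    # not shown in the original) registers Level5 and joins it to Level3 on
    # CONSUMER_ID, e.g.:
    #   schema5.registerTempTable("Level5")
    #   joined = sqlContext.sql(
    #       "SELECT Level5.CONSUMER_ID, Level5.CAMPAIGN_NAME, Level3.KPI_1 "
    #       "FROM Level5 JOIN Level3 ON Level5.CONSUMER_ID = Level3.CONSUMER_ID")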
args = parser.parse_args()
kpi_min, kpi_max, input_l3, output_l3, start_date, end_date = \
    args.kpi_min, args.kpi_max, args.input_l3, args.output_l3, \
    args.start_date, args.end_date

if kpi_min and kpi_max and input_l3 and output_l3 and start_date and end_date:
    conf = SparkConf().setAppName("SparkSQL Evaluation Level3")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    l3 = sc.textFile(input_l3).coalesce(4)
    d3 = l3.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
    field3 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("KPI_1", FloatType(), True),
        StructField("KPI_2", FloatType(), True)
    ]
    s3 = StructType(field3)
    schema3 = sqlContext.applySchema(d3, s3)
    schema3.registerTempTable("Level3")
    sqlContext.cacheTable("Level3")

    # bucket KPI_1 into the [kpi_min, kpi_max] slab, then aggregate per slab
    query3 = sqlContext.sql(
        "SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), "
        "MAX(KPI_1), AVG(KPI_1) FROM "
        "(SELECT KPI_1, "
        "CASE WHEN KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) +
        " THEN 1 ELSE 0 END SEGMENT_ID, "
        "CASE WHEN KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) +
        " THEN '" + str(kpi_min) + " - " + str(kpi_max) + "' ELSE NULL END SLAB "
        "FROM Level3) DUMP "
        "WHERE SLAB IS NOT NULL "
        "GROUP BY SLAB, SEGMENT_ID "
        "ORDER BY SEGMENT_ID")
    query3.coalesce(1).saveAsTextFile(output_l3)
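    # Readability alternative (an assumption, behavior-equivalent rewrite, not
    # the original code): the concatenated SQL above can be built from a single
    # template instead, e.g.:
    #   in_range = "KPI_1 >= {0} AND KPI_1 <= {1}".format(kpi_min, kpi_max)
    #   slab = "{0} - {1}".format(kpi_min, kpi_max)
    #   query3 = sqlContext.sql(
    #       "SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), "
    #       "MAX(KPI_1), AVG(KPI_1) FROM "
    #       "(SELECT KPI_1, CASE WHEN {0} THEN 1 ELSE 0 END SEGMENT_ID, "
    #       "CASE WHEN {0} THEN '{1}' ELSE NULL END SLAB FROM Level3) DUMP "
    #       "WHERE SLAB IS NOT NULL GROUP BY SLAB, SEGMENT_ID "
    #       "ORDER BY SEGMENT_ID".format(in_range, slab))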