dfRates = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_RATINGS) \
    .option("driver", 'com.mysql.jdbc.Driver') \
    .load()

dfMovies = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_MOVIES) \
    .option("driver", 'com.mysql.jdbc.Driver') \
    .load()

dfRates.registerTempTable('Rates')
sqlContext.cacheTable('Rates')
logger.info("Datasets Loaded...")

# ALS hyperparameters
rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

logger.info("Training the ALS model...")
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(),
                  rank=rank, seed=seed, iterations=iterations,
                  lambda_=regularization_parameter)
logger.info("ALS model built!")

# Calculate the top 10 recommendations for every user
predictions = model.recommendProductsForUsers(10)
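The listing is truncated after recommendProductsForUsers, which returns an RDD of (user, list-of-Rating) pairs. A typical continuation, shown here only as a minimal sketch and not the original code, flattens those pairs into one row per recommendation and writes them back over JDBC; the TABLE_RECOMMENDATIONS target table and the column names are assumptions:

# Sketch only -- not the original continuation. Flatten each
# (user, [Rating, ...]) pair into one row per recommendation.
recsRDD = predictions.flatMap(
    lambda (user, recs): [(user, r.product, float(r.rating)) for r in recs])
dfRecs = sqlContext.createDataFrame(recsRDD, ['USERID', 'MOVIEID', 'PREDICTION'])
# TABLE_RECOMMENDATIONS is assumed to name an existing target table.
dfRecs.write.jdbc(url=jdbcUrl, table=TABLE_RECOMMENDATIONS, mode='overwrite',
                  properties={'driver': 'com.mysql.jdbc.Driver'})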
TABLE_RATINGS = 'RATINGS'
TABLE_MOVIES = 'MOVIES'
TABLE_RECOMMENDATIONS = 'RECOMMENDATIONS'

t0 = time()
print "Reading data from postgresql..."
dfRates = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_RATINGS) \
    .option("driver", jdbcDriver) \
    .load()
dfRates.registerTempTable('Ratings')
sqlContext.cacheTable('Ratings')
tt = time() - t0
print "Data is loaded in %s seconds" % round(tt, 3)

rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

t0 = time()
print "Training the ALS model..."
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(),
                  rank=rank, seed=seed, iterations=iterations,
                  lambda_=regularization_parameter)
tt = time() - t0
print "ALS model built!"
print "New model trained in %s seconds" % round(tt, 3)
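The script above trains the model but never checks its fit. A common follow-up, shown here as a sketch rather than part of the original script, is to re-score the known ratings with MLlib's predictAll and report the RMSE:

from math import sqrt

# Sketch only: score the training ratings and compare against the truth.
ratingsRDD = dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), float(r[2])))
userProducts = ratingsRDD.map(lambda p: (p[0], p[1]))
scored = model.predictAll(userProducts) \
              .map(lambda r: ((r.user, r.product), r.rating))
ratesAndPreds = ratingsRDD.map(lambda r: ((r[0], r[1]), r[2])).join(scored)
RMSE = sqrt(ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
print "Root Mean Squared Error = %s" % round(RMSE, 4)

Note that this measures error on the training data itself; a held-out test split would give a less optimistic estimate.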
s5 = StructType(field5)
schema5 = sqlContext.applySchema(d5, s5)
schema5.registerTempTable("Level5")

# Keep only the Level3 rows whose KPI_1 falls inside the requested range
level3_1 = sqlContext.sql("SELECT CONSUMER_ID, KPI_1 FROM Level3 \
    WHERE KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " ")
data3_1 = level3_1.map(lambda z: (str(z[0]), float(z[1])))
field3_1 = [
    StructField("CONSUMER_ID", StringType(), False),
    StructField("KPI", FloatType(), True)
]
s3_1 = StructType(field3_1)
schema3_1 = sqlContext.applySchema(data3_1, s3_1)
schema3_1.registerTempTable("Level3_1")
sqlContext.cacheTable("Level3_1")

# Restrict the Level5 events to the requested date window
level5_1 = sqlContext.sql(
    "SELECT CONSUMER_ID, CAMPAIGN_NAME, EVENT_TYPE_ID FROM Level5 \
    WHERE EVENT_DATE BETWEEN '" + start_date + "' AND '" + end_date + "'")
data5_1 = level5_1.map(lambda z: (str(z[0]), str(z[1]), int(z[2])))
field5_1 = [
    StructField("CONSUMER_ID", StringType(), False),
    StructField("CAMPAIGN_NAME", StringType(), False),
    StructField("EVENT_TYPE_ID", IntegerType(), False)
]
s5_1 = StructType(field5_1)
schema5_1 = sqlContext.applySchema(data5_1, s5_1)
schema5_1.registerTempTable("Level5_1")
sqlContext.cacheTable("Level5_1")
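With Level3_1 and Level5_1 both cached, the natural next step (an assumption on my part; the excerpt ends before it) is to join the KPI slice to the campaign events on CONSUMER_ID, for example:

# Sketch only: count campaign events for the consumers in the KPI slab.
joined = sqlContext.sql(
    "SELECT a.CONSUMER_ID, b.CAMPAIGN_NAME, COUNT(*) AS EVENTS \
    FROM Level3_1 a JOIN Level5_1 b ON a.CONSUMER_ID = b.CONSUMER_ID \
    GROUP BY a.CONSUMER_ID, b.CAMPAIGN_NAME")

Caching both temp tables before the join is what makes repeated queries like this cheap.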
args = parser.parse_args()
kpi_min, kpi_max, input_l3, output_l3, start_date, end_date = \
    args.kpi_min, args.kpi_max, args.input_l3, args.output_l3, \
    args.start_date, args.end_date

if kpi_min and kpi_max and input_l3 and output_l3 and start_date and end_date:
    conf = SparkConf().setAppName("SparkSQL Evaluation Level3")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Strip quotes from the raw CSV and parse (CONSUMER_ID, KPI_1, KPI_2)
    l3 = sc.textFile(input_l3).coalesce(4)
    d3 = l3.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
    field3 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("KPI_1", FloatType(), True),
        StructField("KPI_2", FloatType(), True)
    ]
    s3 = StructType(field3)
    schema3 = sqlContext.applySchema(d3, s3)
    schema3.registerTempTable("Level3")
    sqlContext.cacheTable("Level3")

    # Bucket KPI_1 into the single [kpi_min, kpi_max] slab and aggregate it
    query3 = sqlContext.sql(
        "SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), MAX(KPI_1), AVG(KPI_1) FROM \
        (SELECT KPI_1, CASE WHEN KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " THEN 1 ELSE 0 END SEGMENT_ID, \
        CASE WHEN KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " THEN '" + str(kpi_min) + " - " + str(kpi_max) +
        "' ELSE NULL END SLAB FROM Level3) DUMP \
        WHERE SLAB IS NOT NULL \
        GROUP BY SLAB, SEGMENT_ID \
        ORDER BY SEGMENT_ID")
    query3.coalesce(1).saveAsTextFile(output_l3)
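For completeness, an example invocation of the script. The filename, flag spellings, and paths are all hypothetical, since the argparse definitions are not shown in the excerpt:

spark-submit evaluation_level3.py \
    --kpi_min 10 --kpi_max 100 \
    --input_l3 hdfs:///path/to/level3.csv \
    --output_l3 hdfs:///path/to/level3_slabs \
    --start_date 2015-01-01 --end_date 2015-03-31

The guard on the parsed arguments means the job silently does nothing if any flag is missing, so all six must be supplied.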