예제 #1
0
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_RATINGS) \
    .option("driver", 'com.mysql.jdbc.Driver') \
    .load()

dfMovies = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_MOVIES) \
    .option("driver", 'com.mysql.jdbc.Driver') \
    .load()

dfRates.registerTempTable('Rates')

sqlContext.cacheTable('Rates')

logger.info("Datasets Loaded...")

rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

logger.info("Training the ALS model...")
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(), rank=rank, seed=seed,
                  iterations=iterations, lambda_=regularization_parameter)
logger.info("ALS model built!")

# Calculate all predictions
predictions = model.recommendProductsForUsers(10) \
예제 #2
0
TABLE_RATINGS = 'RATINGS'
TABLE_MOVIES = 'MOVIES'
TABLE_RECOMMENDATIONS = 'RECOMMENDATIONS'

t0 = time()
print "Reading data from postgresql..."
dfRates = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_RATINGS) \
    .option("driver", jdbcDriver) \
    .load()

dfRates.registerTempTable('Ratings')
sqlContext.cacheTable('Ratings')
tt = time() - t0
print "Data is loaded in %s seconds" % round(tt,3)

rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

t0 = time()
print "Training the ALS model..."
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(), rank=rank, seed=seed,
                  iterations=iterations, lambda_=regularization_parameter)
tt = time() - t0
print "ALS model built!"
print "New model trained in %s seconds" % round(tt,3)
예제 #3
0
        s5 = StructType(field5)
        schema5 = sqlContext.applySchema(d5, s5)
        schema5.registerTempTable("Level5")

        level3_1 = sqlContext.sql("SELECT CONSUMER_ID, KPI_1 FROM Level3 \
                            WHERE KPI_1 >= " + str(kpi_min) +
                                  " AND KPI_1 <= " + str(kpi_max) + " ")
        data3_1 = level3_1.map(lambda z: (str(z[0]), float(z[1])))
        field3_1 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("KPI", FloatType(), True)
        ]
        s3_1 = StructType(field3_1)
        schema3_1 = sqlContext.applySchema(data3_1, s3_1)
        schema3_1.registerTempTable("Level3_1")
        sqlContext.cacheTable("Level3_1")

        level5_1 = sqlContext.sql(
            "SELECT CONSUMER_ID, CAMPAIGN_NAME, EVENT_TYPE_ID FROM Level5 WHERE EVENT_DATE BETWEEN '"
            + start_date + "' AND '" + end_date + "'")
        data5_1 = level5_1.map(lambda z: (str(z[0]), str(z[1]), int(z[2])))
        field5_1 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("CAMPAIGN_NAME", StringType(), False),
            StructField("EVENT_TYPE_ID", IntegerType(), False)
        ]
        s5_1 = StructType(field5_1)
        schema5_1 = sqlContext.applySchema(data5_1, s5_1)
        schema5_1.registerTempTable("Level5_1")
        sqlContext.cacheTable("Level5_1")
    args = parser.parse_args()

    kpi_min, kpi_max, input_l3, output_l3, start_date, end_date = args.kpi_min, args.kpi_max, args.input_l3, args.output_l3, args.start_date, args.end_date

    if kpi_min and kpi_max and input_l3 and output_l3 and start_date and end_date:
        conf = SparkConf().setAppName("SparkSQL Evaluation Level3")
        sc = SparkContext(conf=conf)
        sqlContext = SQLContext(sc)
        l3 = sc.textFile(input_l3).coalesce(4)
        d3 = l3.map(lambda z: z.replace('"', '')).map(lambda z: z.split(
            ',')).map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
        field3 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("KPI_1", FloatType(), True),
            StructField("KPI_2", FloatType(), True)
        ]
        s3 = StructType(field3)
        schema3 = sqlContext.applySchema(d3, s3)
        schema3.registerTempTable("Level3")
        sqlContext.cacheTable("Level3")

        query3 = sqlContext.sql("SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), MAX(KPI_1), AVG(KPI_1) FROM \
                (SELECT KPI_1, CASE WHEN KPI_1 >= "                                                        + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " THEN 1 ELSE 0 END SEGMENT_ID, \
                CASE WHEN KPI_1 >= "                                         + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " THEN '" + str(kpi_min) + " - " + str(kpi_max) + \
                    "' ELSE NULL END SLAB FROM Level3) DUMP \
                WHERE SLAB IS NOT NULL \
                GROUP BY SLAB, SEGMENT_ID \
                ORDER BY SEGMENT_ID"                                        )

        query3.coalesce(1).saveAsTextFile(output_l3)