class MusicCollaborativeFiltering:
    """Collaborative-filtering music recommender built on Spark MLlib ALS.

    Reads 'rate' events from the Cassandra keyspace ``music_recommendation``
    and trains an ALS matrix-factorization model on them.  Also ensures the
    output table ``result_cf`` exists (uid -> list of recommendation strings),
    although ``run()`` as written only computes and returns the training MSE.

    NOTE(review): depends on module-level helpers not visible in this chunk:
    ``handle_raw_data``, ``get_id_based_maps``, ``convert_to_index_based``,
    ``convert_to_rating_type`` — confirm their contracts against their
    definitions.
    """

    def __init__(self):
        # Local 4-core Spark context wired to a local Cassandra node.
        # LOCAL_ONE consistency favors availability/latency over consistency.
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        # ALS hyperparameters: number of latent factors and training iterations.
        self.rank = 10
        self.numIteration = 10
        # Number of recommendations to emit per user.
        # NOTE(review): unused in run(); presumably consumed by code outside
        # this chunk that fills result_cf — verify.
        self.numberOfPreds = 10
        # Separate driver-side Cassandra session (cassandra-driver Cluster),
        # distinct from the Spark-Cassandra connector used by the RDD API.
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        # Only explicit 'rate' actions are used as training signal.
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        # Idempotently create the output table for recommendations.
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """Train the ALS model on the rating events and return the training MSE.

        Pipeline: raw Cassandra rows -> parallelized RDD -> string ids mapped
        to dense integer indices (ALS requires numeric ids) -> Rating objects
        -> ALS.train -> predict on the same (user, item) pairs -> join
        predictions with actual ratings -> mean squared error.

        Returns:
            float: mean squared error of the model on its own training pairs
            (an optimistic, in-sample error estimate).
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        # Build string-id -> integer-index maps for users and songs.
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)
        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)
        ratings = convert_to_rating_type(user_event)
        # Predict on the exact (user, item) pairs we trained on.
        test_data = user_event.map(lambda a: (a[0], a[1]))
        model = ALS.train(ratings, self.rank, self.numIteration)
        # Key both RDDs by (user, item) so they can be joined.
        predictions = model.predictAll(test_data).map(
            lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
            predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        # NOTE(review): stopping the context here makes this instance
        # single-use; a second run() on the same object will fail.
        self.sparkContext.stop()
        return MSE
# Spark + Cassandra setup for the "cockpit2_testTogether" table.
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")
sc = CassandraSparkContext(conf = conf)
# Make sure the Spark context is torn down even on abnormal interpreter exit.
atexit.register(lambda: sc.stop())

# Full-table RDD over the Cassandra table; filtering happens downstream.
rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
# SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments


# NOTE(review): name is presumably a typo for "filterDateRange"; kept as-is
# because callers may reference it.  The body is truncated in this chunk:
# `_to` and `col` are never used here — the remainder of the function lies
# outside the visible source.
def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    # NOTE(review): format "%Y-%d-%m %H:%M" parses DAY before MONTH
    # (e.g. "2015-10-01" -> Jan 10th, not Oct 1st).  Looks like a
    # transposed "%Y-%m-%d" — confirm against the callers' date strings.
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
# Export the population/doctor-ratio query results as JSON on local disk
# and persist them to the Cassandra table hospitals.pop_doctor_ratio.
#
# NOTE(review): relies on names defined earlier in the full file:
# sqlContext, popDoctorRatioQueryRDD, sc, and the pyspark.sql.types
# imports (StructType, StructField, StringType, IntegerType, FloatType).

# Explicit schema for the 5-column (zip, pop, income, doctors, ratio) rows.
customSchema = StructType([
    StructField("ZipCode", StringType(), True),
    StructField("Pop", IntegerType(), True),
    StructField("MeanIncome", IntegerType(), True),
    StructField("NumberOfDoctors", IntegerType(), True),
    StructField("Ratio", FloatType(), True),
])

popDoctorRatioQueryDF = sqlContext.createDataFrame(popDoctorRatioQueryRDD,
                                                   customSchema)
# toJSON() yields one JSON string per row; collect() brings them to the
# driver.  list() replaces the original manual append loop.
dfJSONRDD = popDoctorRatioQueryDF.toJSON().collect()
mergedJSON = list(dfJSONRDD)

# BUGFIX: was open(..., "wb").  json.dump writes str, which raises
# TypeError on a binary-mode file in Python 3; open in text mode instead.
with open("/home/cs179g/json/popDoctorRatio.json", "w") as outfile:
    json.dump(mergedJSON, outfile)

# Persist the same rows to Cassandra, keyed by lower-case column names.
popDoctorRatioQueryRDD.map(lambda row: {
    'zipcode': row[0],
    'pop': row[1],
    'meanincome': row[2],
    'numberofdoctors': row[3],
    'ratio': row[4],
}).saveToCassandra(keyspace='hospitals', table='pop_doctor_ratio')

sc.stop()
# NOTE(review): this fragment is CORRUPTED and cannot be repaired without
# guessing:
#   1. The two occurrences of `"...per user:"******"...per item:"` are
#      redaction artifacts (`******`) that replaced real code — almost
#      certainly the `% num_to_recomm_per_user)` / `print(` text between
#      the two print statements.  As written it is a syntax error.
#   2. There are two `else:` branches for a single visible `if
#      SAVE_DATA_TO_DB:`; the second `else:` (the `data_rdd = sc.textFile...`
#      branch) belongs to an outer `if` whose header lies before this chunk.
# Recover the original from version control before using this code.
# What the fragment does, as far as it can be read: tags per-user and
# per-item recommendation RDDs with t_id, accumulates them via union, then
# either saves to MongoDB (save_to_mongo) or prints them; the outer else
# path reads test_file, runs test(), and saves/prints likewise; finally it
# reports elapsed wall-clock time and stops the Spark context.
formatted_rdd_per_user = recomm_per_user.map(lambda row: (t_id,row[0],row[1])) all_results_per_user = all_results_per_user.union(formatted_rdd_per_user) formatted_rdd_per_item = recomm_per_item.map(lambda row: (t_id,row[0],row[1])) all_results_per_item = all_results_per_item.union(formatted_rdd_per_item) if SAVE_DATA_TO_DB: save_to_mongo(all_results_per_user,dbtable_out_per_user) save_to_mongo(all_results_per_item,dbtable_out_per_item) else: print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item) print(all_results_per_user.collect()) else: data_rdd = sc.textFile(test_file).map(lambda l: l.split(',')) recomm_per_user,recomm_per_item = test(data_rdd, num_to_recomm_per_user,num_to_recomm_per_item) if SAVE_DATA_TO_DB: save_to_mongo(recomm_per_user,dbtable_out_per_user) save_to_mongo(recomm_per_item,dbtable_out_per_item) else: print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item) print(recomm_per_item.collect()) elapsed = (time.time() - t0) sc.stop() print ("\nIt took %.2fsec to complete" % elapsed)
# Spark + Cassandra setup for the "cockpit2_testIndexes" table.
# NOTE(review): this chunk is a near-duplicate of the "cockpit2_testTogether"
# variant elsewhere in this paste — consider factoring the shared setup out.
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")
sc = CassandraSparkContext(conf = conf)
# Make sure the Spark context is torn down even on abnormal interpreter exit.
atexit.register(lambda: sc.stop())

# Full-table RDD over the Cassandra table; filtering happens downstream.
rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
# SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments


# NOTE(review): name is presumably a typo for "filterDateRange"; kept as-is
# because callers may reference it.  The body is truncated in this chunk:
# `_to` and `col` are never used here — the remainder of the function lies
# outside the visible source.
def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    # NOTE(review): format "%Y-%d-%m %H:%M" parses DAY before MONTH
    # (e.g. "2015-10-01" -> Jan 10th, not Oct 1st).  Looks like a
    # transposed "%Y-%m-%d" — confirm against the callers' date strings.
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))