from pyspark import SparkConf
from pyspark.mllib.recommendation import ALS
from pyspark_cassandra import CassandraSparkContext
from cassandra.cluster import Cluster

# handle_raw_data, get_id_based_maps, convert_to_index_based and
# convert_to_rating_type are project-local helpers not shown in this snippet.


class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """ This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)

        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)

        ratings = convert_to_rating_type(user_event)

        test_data = user_event.map(lambda a: (a[0], a[1]))
        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(test_data).map(lambda r:
                                                      ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
            predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        self.sparkContext.stop()
        return MSE
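
    # Hedged sketch, not part of the original snippet: the class creates the
    # result_cf table but the code that fills it is not shown. This assumes the
    # id maps go from original string id -> ALS integer index and that the
    # trained model is handed over before the SparkContext is stopped.
    def writeRecommendations(self, model, userMapIdBase, musicMapIdBase):
        idToUser = {v: k for k, v in userMapIdBase.items()}
        idToSong = {v: k for k, v in musicMapIdBase.items()}
        insert = self.session.prepare(
            "INSERT INTO result_cf (uid, recommendations) VALUES (?, ?)")
        for userIndex, uid in idToUser.items():
            # top-N songs for one user from the MLlib model
            topRatings = model.recommendProducts(userIndex, self.numberOfPreds)
            songs = [idToSong[r.product] for r in topRatings]
            self.session.execute(insert, (uid, songs))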
Example #2
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with a new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments
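
# A minimal sketch, not in the original snippet, of the per-day update described
# above: it assumes dict-shaped rows (RowFormat.DICT), the column names listed
# in the UPDATE line, and that `day` matches the stored date type. The
# "tags contains" condition is omitted and filtering happens in Spark, not CQL.
from pyspark_cassandra import RowFormat

def recompute_cnts_for_day(day, site="giga"):
    rows = sc.cassandraTable("el_test", "cockpit2_allTogether",
                             row_format=RowFormat.DICT) \
             .filter(lambda r: r["date"] == day and r["site"] == site)
    updated = rows.map(lambda r: dict(r, cnts=(
        r["ga_videoPlays"] + r["sda_downloads"] +
        r["fb_socialFacebookLikes"] + r["fb_socialFacebookShares"] +
        r["fb_socialFacebookComments"] + r["tw_socialTwitterShares"] +
        r["ga_socialGooglePlusShares"] + r["gigya_socialComments"])))
    # write the recomputed cnts back to the same table
    updated.saveToCassandra("el_test", "cockpit2_allTogether")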

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))
    # assumed completion of the truncated snippet: col is a tz-aware datetime
    return dtf <= col <= dtt
Example #3
    customSchema = StructType([
        StructField("ZipCode", StringType(), True),
        StructField("Pop", IntegerType(), True),
        StructField("MeanIncome", IntegerType(), True),
        StructField("NumberOfDoctors", IntegerType(), True),
        StructField("Ratio", FloatType(), True)])

    popDoctorRatioQueryDF = sqlContext.createDataFrame(popDoctorRatioQueryRDD,
                                                       customSchema)

    dfJSONRDD = popDoctorRatioQueryDF.toJSON().collect()

    mergedJSON = []
    for row in dfJSONRDD:
        mergedJSON.append(row)

    with open("/home/cs179g/json/popDoctorRatio.json", "wb") as outfile:
        json.dump(mergedJSON, outfile)

    popDoctorRatioQueryRDD.map(
        lambda row: {
            'zipcode': row[0],
            'pop': row[1],
            'meanincome': row[2],
            'numberofdoctors': row[3],
            'ratio': row[4]
        }).saveToCassandra(keyspace='hospitals', table='pop_doctor_ratio')

    sc.stop()
            formatted_rdd_per_user = recomm_per_user.map(lambda row: (t_id,row[0],row[1]))
            all_results_per_user = all_results_per_user.union(formatted_rdd_per_user)
            formatted_rdd_per_item = recomm_per_item.map(lambda row: (t_id,row[0],row[1]))
            all_results_per_item = all_results_per_item.union(formatted_rdd_per_item)
                
        if SAVE_DATA_TO_DB:
            save_to_mongo(all_results_per_user,dbtable_out_per_user)
            save_to_mongo(all_results_per_item,dbtable_out_per_item)
        else:
            print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item)
            print(all_results_per_user.collect())
    else:
        data_rdd = sc.textFile(test_file).map(lambda l: l.split(','))
        recomm_per_user,recomm_per_item = test(data_rdd, num_to_recomm_per_user,num_to_recomm_per_item)

        if SAVE_DATA_TO_DB:
            save_to_mongo(recomm_per_user,dbtable_out_per_user)
            save_to_mongo(recomm_per_item,dbtable_out_per_item)
        else:
            print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item)
            print(recomm_per_item.collect())
            
    elapsed = (time.time() - t0)
    sc.stop()
    print ("\nIt took %.2fsec to complete" % elapsed) 

Example #5
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with a new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments
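
# The "for( d in range 2015-10-01 ~ 2015-10-10 )" line above is only pseudocode.
# A minimal generator for that inclusive day range, reusing the datetime/pytz
# imports at the top of this snippet; the name iter_days and the ISO date format
# are assumptions, not part of the original code.
def iter_days(_from, _to):
    loc = timezone('Europe/Berlin')
    day = loc.localize(datetime.strptime(_from, "%Y-%m-%d"))
    end = loc.localize(datetime.strptime(_to, "%Y-%m-%d"))
    while day <= end:
        # each yielded day could drive one SELECT/UPDATE pass as described above
        yield day.date()
        day += timedelta(days=1)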

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))