Example #1
import datetime as dt
from uuid import uuid4

from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext, saveToCassandra


def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the rows returned by Cassandra are
    # dict-like: partition keys, clustering keys and regular columns can all
    # be accessed by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {
            "visitor_id": "xyz"
        }
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example #2
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the rows returned by Cassandra are
    # dict-like: partition keys, clustering keys and regular columns can all
    # be accessed by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example #3
import datetime

from cassandra.cluster import Cluster
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


class Popularity(object):
    def __init__(self):
        self.spark_config = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("Popularity")\
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                             "FROM user_event "
                                             "WHERE action_type='listen';")
        self.session.execute("DROP TABLE IF EXISTS result_popularity ;")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                             "sid text PRIMARY KEY,"
                             "rank int);")
        self.current_year = datetime.datetime.now().year
        self.current_month = datetime.datetime.now().month

    def _compare_date_time(self, month, year):
        # True only for events from the current month of the current year.
        return self.current_year == year and self.current_month == month

    def _handle_raw_data(self):
        new_data_set = list()
        for row in self.raw_data:
            month = row.timestamp.month
            year = row.timestamp.year
            if self._compare_date_time(month, year):
                new_tuple = tuple([row.song_id, 1])
                new_data_set.append(new_tuple)

        return new_data_set

    def calculate(self):
        dist_data = self.sparkContext.parallelize(self._handle_raw_data())
        counts = dist_data.reduceByKey(lambda a, b: a + b)
        counts = counts.sortBy(lambda a: a[1], ascending=False).take(10)
        result = self.sparkContext.parallelize(counts)
        result.saveToCassandra("music_recommendation", "result_popularity")

        print(result.collect())
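The calculate() method above parallelizes plain (song_id, count) tuples before saving; a hedged, standalone sketch (my addition, not from the original example) of the same save step using dict rows keyed by the result_popularity column names, the style Example #12 below uses, would be:

def save_popularity(sc, top_counts):
    # top_counts: a list of (song_id, count) tuples, e.g. the take(10) result.
    # Dict rows let the connector match the result_popularity columns by name.
    rows = [{"sid": song_id, "rank": count} for song_id, count in top_counts]
    sc.parallelize(rows).saveToCassandra("music_recommendation",
                                         "result_popularity")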
Example #4
# Assumes project-level helpers: handle_raw_data, get_id_based_maps,
# convert_to_index_based and convert_to_rating_type.
from cassandra.cluster import Cluster
from pyspark import SparkConf
from pyspark.mllib.recommendation import ALS
from pyspark_cassandra import CassandraSparkContext


class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """ This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)

        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)

        ratings = convert_to_rating_type(user_event)

        test_data = user_event.map(lambda a: (a[0], a[1]))
        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(test_data).map(lambda r:
                                                      ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
            predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        self.sparkContext.stop()
        return MSE
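A minimal, hypothetical invocation of the class above (not part of the original snippet):

if __name__ == "__main__":
    mse = MusicCollaborativeFiltering().run()
    print("Training MSE: {:.4f}".format(mse))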
Example #5
class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """ This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)

        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)

        nonRatedUserMusic = get_non_rated_user_music(user_event)
        ratings = convert_to_rating_type(user_event)

        model = ALS.train(ratings, self.rank, self.numIteration)

        predictions = model.predictAll(nonRatedUserMusic)
        predictions = get_final_result(self.numberOfPreds, predictions,
                                       userMapIdBase, musicMapIdBase)
        predictions.saveToCassandra("music_recommendation", "result_cf",
                                    {"uid", "recommendations"})
    
Example #6
    col_tenant_id = 1
    col_user_id = 2
    col_item_id = 3

    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    
    
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set("spark.cassandra.connection.host", spark_cassandra_connection_host)
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    data = sc.cassandraTable("mykeyspace", "transactions",row_format=1).collect() # row_format: tuple
    # (id, tenant_id, user_id, item_id)
    tenant_ids = set(map(lambda x: x[col_tenant_id], data))
    data_rdd = sc.parallelize(data)
    # data_rdd = sc.parallelize(data).map(list)
    
    all_results_per_user = sc.emptyRDD()
    all_results_per_item = sc.emptyRDD()
    
    for t_id in tenant_ids:
        print("\nComputing recommendation for tenant {}...\n".format(t_id))
        per_tenant_rdd = data_rdd.filter(
            lambda x: x[col_tenant_id] == t_id).map(
            lambda l: ((l[col_user_id],l[col_item_id]),1.0)).reduceByKey(
            lambda x,y: x + y).map(
            lambda x: (x[0][0],x[0][1],x[1]))
        recomm_per_user,recomm_per_item = recomm_cf.TrainAndComputeRecommendation(sc, per_tenant_rdd,
                                                                        num_to_recomm_per_user,
                                                                        num_to_recomm_per_item)
        print("Usage: spark-calculate-pageview", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-pageview") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    i = 1511740800
    while i <= 1514505600:
    # while True:
        date_temp = i
        i = i + 86400
        # current_date = getGMT()
        # future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","location_path")\
                .filter(lambda x: date_temp <= int(x['m_date']) < i)
        if not rdd.isEmpty():
            x = rdd.toDF().groupBy(['location_path']).count()
            # x.show()
            array = []
            for row in x.collect():
                x = {
                    'location_path': row['location_path'], 
                    'm_date': date_temp, 
                    'count':row['count'],
                    'bucket':5}
                array.append(x)     
            result = sc.parallelize(array)
            result.saveToCassandra('web_analytic','page_view_report')
            break
Example #8
        current_date = getGMT()
        future_date = getNextGMT()
        # date_temp = i
        # i = i + 86400
        raw = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","userid","fsa","fsid","location_path")
        if not raw.isEmpty():
            df = raw.toDF()
            current_day = df.filter( df.m_date >= current_date ).filter(df.m_date < future_date).dropDuplicates(['fsa',"fsid"]).select('fsa','fsid')
            previous_day =  df.filter(df.m_date < current_date).select('fsa','fsid')
            # current_day = df.filter( df.m_date >= date_temp ).filter(df.m_date < i).dropDuplicates(['fsa',"fsid"]).select('fsa','fsid')
            # previous_day =  df.filter(df.m_date < date_temp).select('fsa','fsid')
            result_new_user = current_day.subtract(previous_day)
            total_newuser = result_new_user.count()
            result_newuser = sc.parallelize([{
                "bucket":1,
                "m_date": int(current_date),
                # "m_date": int(date_temp),
                "newusers": int(total_newuser)
            }])
            

            rdd = raw.filter(lambda x: current_date <= int(x['m_date']) < future_date)
            # rdd = raw.filter(lambda x: date_temp <= int(x['m_date']) < i)
            if not rdd.isEmpty():
                table = rdd.toDF()
                total_user=table.dropDuplicates(['fsa',"fsid"]).count()

                result_total_user = sc.parallelize([{
                    "bucket":0,
                    "m_date": int(current_date),
                    # "m_date": int(date_temp),
                    "users": int(total_user)
Example #9
			if song in song_map:
				listeners = song_map[song]
			else:
				listeners = songuserdb.select("user_id") \
						.where("song_id=? and req_time > ? and req_time < ?", int(song), five_weeks_back, now+1) \
						.map(lambda row: (row.user_id, 1)) \
						.reduceByKey(lambda count1, count2: count1+count2) \
						.sortBy(lambda x: x[1], ascending=False) \
						.map(lambda x: x[0]) \
						.take(11)
				song_map[song] = list(listeners)

			if user in listeners:
				listeners.remove(user)
			user_suggest += listeners

		if len(user_suggest) > 0 and len(songs) > 0:

			user_freq = Counter(user_suggest)

			rdd = sc.parallelize([{
				"user_id": user,
				"timestamp": now,
				"suggested_user_id": suggested_user[0],
				"relevance_score": round(float(suggested_user[1])/len(songs) , 3)
			} for suggested_user in user_freq.most_common(25)])

			rdd.saveToCassandra(config.CASSANDRA_KEYSPACE, "user_relevance")

	sc.stop()
Example #10
                                                       ]).count()
            result_location_os = table_drop.groupBy(['location_os']).count()
            result_config_resolution = table_drop.groupBy(
                ['config_resolution']).count()
            #-------------------------------------------
            array_config_browser = []
            for row in result_config_browser.collect():
                x = {
                    'config_browser': row['config_browser'],
                    'browser_count': row['count'],
                    'm_date': current_date,
                    # 'm_date': date_temp,
                    'bucket': 4
                }
                array_config_browser.append(x)
            result_config_browser = sc.parallelize(array_config_browser)
            #------------------------------------------

            array_config_device = []
            for row in result_config_device.collect():
                x = {
                    'config_device': row['config_device'],
                    'device_count': row['count'],
                    # 'm_date': date_temp,
                    'm_date': current_date,
                    'bucket': 3
                }
                array_config_device.append(x)
            result_config_device = sc.parallelize(array_config_device)
            #------------------------------------------
Example #11
		asciiTweet = text.encode('ascii', 'ignore')
		if textstat.lexicon_count(asciiTweet) == 0:
			return 0
		else:
			return textstat.flesch_kincaid_grade(asciiTweet)
	
	# make a new data frame with grade data
	# make a new function we can use with the withColumn function
	udfGrade = udf(processTweets, FloatType())
	gradeTweets = placeTweets.withColumn("grade", udfGrade("text"))
	
	# group and aggregate the data
	avgGradeByCity = gradeTweets.groupBy("place.full_name").agg({"grade" : "avg"})
	temp = avgGradeByCity.map(lambda row: {'place': row.full_name,
	                                       'avg_grade': row["avg(grade)"]}).collect()
	sc.parallelize(temp).saveToCassandra(keyspace='twitter', table='avggradebycity')
 
	def wordCount(row):
	    # re-encode the tweet, and count the words
	    wordDict = dict()
	    tweet = row.text.encode('ascii', 'ignore').lower()
	    arrayTuples = extractor(tweet)
	    for element in arrayTuples:
		if element[0] not in wordDict and element[0] not in stop_words:
		    wordDict[element[0]] = element[1]
		elif element[0] not in stop_words:
		    wordDict[element[0]] += element[1]
 
 
	    # form the list of tuples
	    wordFreq = list()
Example #12
from pyspark.conf import SparkConf
from pyspark_cassandra import CassandraSparkContext, Row
import pyspark_cassandra

conf = SparkConf().setAppName("PySpark Cassandra Test").set(
    "spark.cassandra.connection.host", "localhost")

sc = CassandraSparkContext(conf=conf)
rdd = sc.parallelize([{
    "subreddit": "politics",
    "word": "ketan",
    "count": 2,
    "score": 4
}])
rdd.saveToCassandra("reddit", "word_counter")
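The snippet above assumes a reddit.word_counter table already exists. A hypothetical schema matching the row dict (the composite primary key is my assumption, not given by the source) could be created with the DataStax Python driver:

from cassandra.cluster import Cluster

# Hypothetical schema for the word_counter table used above; the
# (subreddit, word) primary key is an assumption, not from the source.
session = Cluster(["localhost"]).connect()
session.execute("CREATE KEYSPACE IF NOT EXISTS reddit WITH replication = "
                "{'class': 'SimpleStrategy', 'replication_factor': 1};")
session.execute("CREATE TABLE IF NOT EXISTS reddit.word_counter ("
                "subreddit text, word text, count int, score int, "
                "PRIMARY KEY (subreddit, word));")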
Example #13
 .setAppName("spark-calculate-total-user") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)

    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","userid","fsa","fsid")\
                .filter(lambda x: current_date <= int(x['m_date']) < future_date)

        if not rdd.isEmpty():
            table = rdd.toDF()
            # table.show(truncate=False)
            total = table.dropDuplicates(['fsa', "fsid"]).count()

            result = sc.parallelize([{
                "bucket": 0,
                "m_date": int(current_date),
                "users": int(total)
            }])
        else:
            result = sc.parallelize([{
                "bucket": 0,
                "m_date": int(current_date),
                "users": 0
            }])
        result.saveToCassandra('web_analytic', 'user_daily_report')
        # break
        # time.sleep(2)
    pass
Example #14
# Assumes a project-level helper: similar(user_profile, item_profiles, events).
from cassandra.cluster import Cluster
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


class UserItemSimilarity(object):
    def __init__(self):
        self.spark_config = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("ContentBased") \
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")

        cql_cmd = "SELECT * FROM %s"
        cmd = cql_cmd % "i_profile_artist"
        self.i_artists_res = self.session.execute(cmd)
        cmd = cql_cmd % "i_profile_composer"
        self.i_composers_res = self.session.execute(cmd)
        cmd = cql_cmd % "i_profile_genre"
        self.i_genres_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_artist"
        self.u_artists_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_composer"
        self.u_composers_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_genre"
        self.u_genres_res = self.session.execute(cmd)

        cql_cmd = "SELECT uid, song_id FROM %s"
        events = self.session.execute(cql_cmd % "user_event")
        self.events = dict()
        for event in events:
            songs = self.events.get(event.uid)
            if songs is None:
                self.events[event.uid] = [event.song_id]
            else:
                self.events[event.uid].append(event.song_id)

        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_genre ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_artist ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_composer ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    @staticmethod
    def convert_i(rows):
        result = dict()
        for row in rows:
            result[row.sid] = row.profile

        return result

    @staticmethod
    def convert_u(rows):
        result = list()
        for row in rows:
            new_tuple = (row.uid, row.profile)
            result.append(new_tuple)

        return result

    def build(self):
        events = self.events

        i_genres = self.convert_i(self.i_genres_res)
        i_artists = self.convert_i(self.i_artists_res)
        i_composers = self.convert_i(self.i_composers_res)

        u_genres = self.convert_u(self.u_genres_res)
        u_artists = self.convert_u(self.u_artists_res)
        u_composers = self.convert_u(self.u_composers_res)

        dist_u_genres = self.sparkContext.parallelize(u_genres)
        dist_u_genres = dist_u_genres.map(
            lambda u: similar(u, i_genres, events))
        dist_u_genres.saveToCassandra("music_recommendation",
                                      "result_cb_user_item_genre")

        dist_u_artists = self.sparkContext.parallelize(u_artists)
        dist_u_artists = dist_u_artists.map(
            lambda u: similar(u, i_artists, events))
        dist_u_artists.saveToCassandra("music_recommendation",
                                       "result_cb_user_item_artist")

        dist_u_composer = self.sparkContext.parallelize(u_composers)
        dist_u_composer = dist_u_composer.map(
            lambda u: similar(u, i_composers, events))
        dist_u_composer.saveToCassandra("music_recommendation",
                                        "result_cb_user_item_composer")
Example #15
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")
sc = CassandraSparkContext("spark://192.168.15.87:7077",
                           "Simple App",
                           conf=conf)
rdd = sc.cassandraTable("testkeyspace", "stock").select(
    "ric", "date", "time", "high", "low").groupBy(
        lambda r: r["date"] > 20050613 and r["date"] < 20170511).collect()

for gr in rdd:
    if gr[0]:
        new_rdd = sc.parallelize(list(gr[1]))

for time in [
        "9:30:00 AM", "10:30:00 AM", "11:30:00 AM", "12:30:00 PM",
        "1:30:00 PM", "2:30:00 PM"
]:
    rdd_temp = new_rdd.groupBy(lambda r: r["time"] == time)
    for r in rdd_temp.collect():
        if r[0]:
            for i in r[1]:
                print(i)  #each batch
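The boolean groupBy above materializes both groups on the driver just to pick out the rows inside the date range; a hedged sketch of the same logic with a plain filter (my rewrite, not from the original example, reusing the sc and imports defined above) is:

in_range = sc.cassandraTable("testkeyspace", "stock").select(
    "ric", "date", "time", "high", "low").filter(
        lambda r: 20050613 < r["date"] < 20170511)

for t in ["9:30:00 AM", "10:30:00 AM", "11:30:00 AM",
          "12:30:00 PM", "1:30:00 PM", "2:30:00 PM"]:
    # Bind t as a default argument so each lazily evaluated filter keeps
    # the time slot it was created for.
    for row in in_range.filter(lambda r, t=t: r["time"] == t).collect():
        print(row)  # each batch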