import datetime as dt
from uuid import uuid4

from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the rows returned by Cassandra are
    # dict-like: partition keys, clustering keys and regular columns are all
    # accessible by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
                     .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {"visitor_id": "xyz"}
    },)
    sc.parallelize(pixels).saveToCassandra(keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))

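# A minimal entry point for the driver above -- a sketch only; the keyspace
# and table names here are illustrative, not taken from the original sample.
if __name__ == "__main__":
    run_driver("demo", "pixels", "127.0.0.1")
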
import datetime

from cassandra.cluster import Cluster
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


class Popularity(object):

    def __init__(self):
        self.spark_config = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("Popularity") \
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                             "FROM user_event "
                                             "WHERE action_type='listen';")
        self.session.execute("DROP TABLE IF EXISTS result_popularity;")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                             "sid text PRIMARY KEY,"
                             "rank int);")
        self.current_year = datetime.datetime.now().year
        self.current_month = datetime.datetime.now().month

    def _compare_date_time(self, month, year):
        # True only for events from the current month of the current year.
        # (The original returned None instead of False when the year did not
        # match, which worked only because None is falsy.)
        return self.current_year == year and self.current_month == month

    def _handle_raw_data(self):
        # Keep only this month's listens, as (song_id, 1) pairs for counting
        new_data_set = list()
        for row in self.raw_data:
            if self._compare_date_time(row.timestamp.month,
                                       row.timestamp.year):
                new_data_set.append((row.song_id, 1))
        return new_data_set

    def calculate(self):
        dist_data = self.sparkContext.parallelize(self._handle_raw_data())
        counts = dist_data.reduceByKey(lambda a, b: a + b)
        counts = counts.sortBy(lambda a: a[1], ascending=False).take(10)
        result = self.sparkContext.parallelize(counts)
        result.saveToCassandra("music_recommendation", "result_popularity")
        print(result.collect())

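# Usage sketch: all I/O happens in __init__ and calculate(), so driving the
# job is a single call (this assumes a local Cassandra node with the
# music_recommendation keyspace and a user_event table already in place):
if __name__ == "__main__":
    Popularity().calculate()
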
from cassandra.cluster import Cluster
from pyspark import SparkConf
from pyspark.mllib.recommendation import ALS
from pyspark_cassandra import CassandraSparkContext


class MusicCollaborativeFiltering:

    def __init__(self):
        self.sparkConfig = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("MCF") \
            .set("spark.cassandra.connection.host", "127.0.0.1") \
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """Train the ALS model and return its mean squared error on the
        training ratings.

        handle_raw_data, get_id_based_maps, convert_to_index_based and
        convert_to_rating_type are project-local helpers (not shown here).
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)
        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)
        ratings = convert_to_rating_type(user_event)
        test_data = user_event.map(lambda a: (a[0], a[1]))

        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(test_data) \
                           .map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])) \
                               .join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
        self.sparkContext.stop()
        return MSE

class MusicCollaborativeFiltering:

    def __init__(self):
        self.sparkConfig = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("MCF") \
            .set("spark.cassandra.connection.host", "127.0.0.1") \
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """Train the ALS model, predict ratings for the user/song pairs that
        have no rating yet, and save the top recommendations to Cassandra.

        handle_raw_data, get_id_based_maps, convert_to_index_based,
        get_non_rated_user_music, convert_to_rating_type and get_final_result
        are project-local helpers (a sketch of convert_to_rating_type follows
        below).
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)
        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)
        nonRatedUserMusic = get_non_rated_user_music(user_event)
        ratings = convert_to_rating_type(user_event)

        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(nonRatedUserMusic)
        predictions = get_final_result(self.numberOfPreds, predictions,
                                       userMapIdBase, musicMapIdBase)
        # Column names are given as an ordered list; the original passed a
        # set, whose iteration order is undefined
        predictions.saveToCassandra("music_recommendation", "result_cf",
                                    ["uid", "recommendations"])

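# For reference, one plausible shape for a helper used above -- a sketch
# only, since the project's real implementation is not shown. It assumes
# user_event holds (user_index, song_index, rating) triples after the
# index conversion.
from pyspark.mllib.recommendation import Rating


def convert_to_rating_type(user_event):
    # Wrap each triple in an MLlib Rating object, which ALS.train expects
    return user_event.map(lambda e: Rating(int(e[0]), int(e[1]), float(e[2])))
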
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext

import recomm_cf  # project-local module providing the training routine

col_tenant_id = 1
col_user_id = 2
col_item_id = 3
num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

# spark_cassandra_connection_host is assumed to be configured elsewhere
conf = SparkConf() \
    .setAppName("PysparkCollaborativeFiltering") \
    .set("spark.cassandra.connection.host", spark_cassandra_connection_host)
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')

# row_format=1 returns each row as a tuple: (id, tenant_id, user_id, item_id)
data = sc.cassandraTable("mykeyspace", "transactions", row_format=1).collect()

tenant_ids = set(map(lambda x: x[col_tenant_id], data))
data_rdd = sc.parallelize(data)

all_results_per_user = sc.emptyRDD()
all_results_per_item = sc.emptyRDD()
for t_id in tenant_ids:
    print("\nComputing recommendation for tenant {}...\n".format(t_id))
    # Count repeat (user, item) interactions within this tenant and emit
    # (user_id, item_id, rating) triples
    per_tenant_rdd = data_rdd \
        .filter(lambda x: x[col_tenant_id] == t_id) \
        .map(lambda l: ((l[col_user_id], l[col_item_id]), 1.0)) \
        .reduceByKey(lambda x, y: x + y) \
        .map(lambda x: (x[0][0], x[0][1], x[1]))
    recomm_per_user, recomm_per_item = recomm_cf.TrainAndComputeRecommendation(
        sc, per_tenant_rdd, num_to_recomm_per_user, num_to_recomm_per_item)
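    # The empty RDDs above are presumably accumulators; a sketch of how the
    # per-tenant results might be folded in (an assumption -- the original
    # fragment ends before this point):
    all_results_per_user = all_results_per_user.union(recomm_per_user)
    all_results_per_item = all_results_per_item.union(recomm_per_item)
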
print("Usage: spark-calculate-pageview", file=sys.stderr) exit(-1) conf = SparkConf() \ .setAppName("spark-calculate-pageview") \ .set("spark.cassandra.connection.host", "10.88.113.74") sc = CassandraSparkContext(conf=conf) spark = SparkSession(sc) i = 1511740800 while i <= 1514505600: # while True: date_temp = i i = i + 86400 # current_date = getGMT() # future_date = getNextGMT() rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","location_path")\ .filter(lambda x: date_temp <= int(x['m_date']) < i) if rdd.isEmpty() == False: x = rdd.toDF().groupBy(['location_path']).count() # x.show() array = [] for row in x.collect(): x = { 'location_path': row['location_path'], 'm_date': date_temp, 'count':row['count'], 'bucket':5} array.append(x) result = sc.parallelize(array) result.saveToCassandra('web_analytic','page_view_report') break pass
# getGMT()/getNextGMT() are project-local date helpers (see the sketch after
# this fragment)
current_date = getGMT()
future_date = getNextGMT()

raw = sc.cassandraTable("web_analytic", "fsa_log_visit") \
    .select("m_date", "userid", "fsa", "fsid", "location_path")
if not raw.isEmpty():
    df = raw.toDF()
    # New users: (fsa, fsid) pairs seen today but never on a previous day
    current_day = df.filter(df.m_date >= current_date) \
                    .filter(df.m_date < future_date) \
                    .dropDuplicates(['fsa', 'fsid']) \
                    .select('fsa', 'fsid')
    previous_day = df.filter(df.m_date < current_date).select('fsa', 'fsid')
    result_new_user = current_day.subtract(previous_day)
    total_newuser = result_new_user.count()
    result_newuser = sc.parallelize([{
        "bucket": 1,
        "m_date": int(current_date),
        "newusers": int(total_newuser)
    }])

    rdd = raw.filter(lambda x: current_date <= int(x['m_date']) < future_date)
    if not rdd.isEmpty():
        table = rdd.toDF()
        total_user = table.dropDuplicates(['fsa', 'fsid']).count()
        result_total_user = sc.parallelize([{
            "bucket": 0,
            "m_date": int(current_date),
            "users": int(total_user)
        }])

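# A plausible shape for the two date helpers used above (a sketch; the
# originals are not shown): midnight today and midnight tomorrow as UTC
# epoch seconds, matching how m_date is compared against them.
import calendar
import datetime


def getGMT():
    today = datetime.datetime.utcnow().date()
    return calendar.timegm(today.timetuple())


def getNextGMT():
    return getGMT() + 86400  # one day later
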
from collections import Counter

# Fragment: for each song the user played, gather its recent top listeners;
# song_map caches the lookup and user_suggest accumulates candidates
for song in songs:  # enclosing loop reconstructed; the fragment began inside it
    if song in song_map:
        # copy, so removing `user` below does not mutate the cached list
        listeners = list(song_map[song])
    else:
        # Top 11 listeners of this song over the last five weeks
        listeners = songuserdb.select("user_id") \
            .where("song_id=? and req_time > ? and req_time < ?",
                   int(song), five_weeks_back, now + 1) \
            .map(lambda row: (row.user_id, 1)) \
            .reduceByKey(lambda count1, count2: count1 + count2) \
            .sortBy(lambda x: x[1], ascending=False) \
            .map(lambda x: x[0]) \
            .take(11)
        song_map[song] = list(listeners)
    if user in listeners:
        listeners.remove(user)
    user_suggest += listeners

if len(user_suggest) > 0 and len(songs) > 0:
    user_freq = Counter(user_suggest)
    rdd = sc.parallelize([{
        "user_id": user,
        "timestamp": now,
        "suggested_user_id": suggested_user[0],
        "relevance_score": round(float(suggested_user[1]) / len(songs), 3)
    } for suggested_user in user_freq.most_common(25)])
    rdd.saveToCassandra(config.CASSANDRA_KEYSPACE, "user_relevance")
sc.stop()

# Head of the fragment reconstructed: table_drop is a deduplicated visits
# DataFrame from earlier in the original script (not shown), and both the
# browser and device aggregates are consumed below.
result_config_browser = table_drop.groupBy(['config_browser']).count()
result_config_device = table_drop.groupBy(['config_device']).count()
result_location_os = table_drop.groupBy(['location_os']).count()
result_config_resolution = table_drop.groupBy(
    ['config_resolution']).count()

#-------------------------------------------
array_config_browser = []
for row in result_config_browser.collect():
    array_config_browser.append({
        'config_browser': row['config_browser'],
        'browser_count': row['count'],
        'm_date': current_date,
        'bucket': 4
    })
result_config_browser = sc.parallelize(array_config_browser)

#------------------------------------------
array_config_device = []
for row in result_config_device.collect():
    array_config_device.append({
        'config_device': row['config_device'],
        'device_count': row['count'],
        'm_date': current_date,
        'bucket': 3
    })
result_config_device = sc.parallelize(array_config_device)
#------------------------------------------

# Fragment: placeTweets, sc, textstat, extractor and stop_words come from
# earlier in the original script (not shown).
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType


def processTweets(text):  # header reconstructed; the fragment began inside it
    # re-encode the tweet; tweets with no words get grade 0
    asciiTweet = text.encode('ascii', 'ignore')
    if textstat.lexicon_count(asciiTweet) == 0:
        return 0.0
    else:
        return textstat.flesch_kincaid_grade(asciiTweet)

# make a new data frame with grade data
# make a new function we can use with the withColumn function
udfGrade = udf(processTweets, FloatType())
gradeTweets = placeTweets.withColumn("grade", udfGrade("text"))

# group and aggregate the data
avgGradeByCity = gradeTweets.groupBy("place.full_name").agg({"grade": "avg"})
temp = avgGradeByCity.rdd.map(lambda row: {
    'place': row.full_name,
    'avg_grade': row["avg(grade)"]
}).collect()
sc.parallelize(temp).saveToCassandra(keyspace='twitter', table='avggradebycity')


def wordCount(row):
    # re-encode the tweet, and count the words
    wordDict = dict()
    tweet = row.text.encode('ascii', 'ignore').lower()
    arrayTuples = extractor(tweet)
    for element in arrayTuples:
        if element[0] not in wordDict and element[0] not in stop_words:
            wordDict[element[0]] = element[1]
        elif element[0] not in stop_words:
            wordDict[element[0]] += element[1]
    # form the list of tuples
    wordFreq = list()
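    # Plausible completion of wordCount -- a sketch, since the original
    # fragment ends mid-function: emit the counts as (word, count) tuples.
    for word, count in wordDict.items():
        wordFreq.append((word, count))
    return wordFreq
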
import pyspark_cassandra
from pyspark.conf import SparkConf
from pyspark_cassandra import CassandraSparkContext, Row

conf = SparkConf().setAppName("PySpark Cassandra Test").set(
    "spark.cassandra.connection.host", "localhost")
sc = CassandraSparkContext(conf=conf)

rdd = sc.parallelize([{
    "subreddit": "politics",
    "word": "ketan",
    "count": 2,
    "score": 4
}])
rdd.saveToCassandra("reddit", "word_counter")
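# Read the row back to check the write -- a sketch, assuming
# reddit.word_counter is keyed (at least) by subreddit and word:
saved = sc.cassandraTable("reddit", "word_counter") \
    .where("subreddit=? and word=?", "politics", "ketan") \
    .collect()
print(saved)
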
.setAppName("spark-calculate-total-user") \ .set("spark.cassandra.connection.host", "10.88.113.74") sc = CassandraSparkContext(conf=conf) spark = SparkSession(sc) while True: current_date = getGMT() future_date = getNextGMT() rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","userid","fsa","fsid")\ .filter(lambda x: current_date <= int(x['m_date']) < future_date) if rdd.isEmpty() == False: table = rdd.toDF() # table.show(truncate=False) total = table.dropDuplicates(['fsa', "fsid"]).count() result = sc.parallelize([{ "bucket": 0, "m_date": int(current_date), "users": int(total) }]) else: result = sc.parallelize([{ "bucket": 0, "m_date": int(current_date), "users": 0 }]) result.saveToCassandra('web_analytic', 'user_daily_report') # break # time.sleep(2) pass
from cassandra.cluster import Cluster
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


class UserItemSimilarity(object):

    def __init__(self):
        self.spark_config = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("ContentBased") \
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")

        cql_cmd = "SELECT * FROM %s"
        self.i_artists_res = self.session.execute(cql_cmd % "i_profile_artist")
        self.i_composers_res = self.session.execute(cql_cmd % "i_profile_composer")
        self.i_genres_res = self.session.execute(cql_cmd % "i_profile_genre")
        self.u_artists_res = self.session.execute(cql_cmd % "u_profile_artist")
        self.u_composers_res = self.session.execute(cql_cmd % "u_profile_composer")
        self.u_genres_res = self.session.execute(cql_cmd % "u_profile_genre")

        # Map each user to the list of songs they already have events for
        cql_cmd = "SELECT uid, song_id FROM %s"
        events = self.session.execute(cql_cmd % "user_event")
        self.events = dict()
        for event in events:
            self.events.setdefault(event.uid, []).append(event.song_id)

        for table in ("result_cb_user_item_genre",
                      "result_cb_user_item_artist",
                      "result_cb_user_item_composer"):
            self.session.execute("CREATE TABLE IF NOT EXISTS %s ("
                                 "uid text PRIMARY KEY,"
                                 "recommendations list<text>);" % table)

    @staticmethod
    def convert_i(rows):
        # Item profiles as {song_id: profile}
        result = dict()
        for row in rows:
            result[row.sid] = row.profile
        return result

    @staticmethod
    def convert_u(rows):
        # User profiles as (uid, profile) tuples
        result = list()
        for row in rows:
            result.append((row.uid, row.profile))
        return result

    def build(self):
        # `similar` is a project-local helper (not shown) that scores one
        # user profile against the item profiles, given the user's listening
        # history in `events` -- a sketch of one plausible version follows
        # after this class
        events = self.events
        i_genres = self.convert_i(self.i_genres_res)
        i_artists = self.convert_i(self.i_artists_res)
        i_composers = self.convert_i(self.i_composers_res)
        u_genres = self.convert_u(self.u_genres_res)
        u_artists = self.convert_u(self.u_artists_res)
        u_composers = self.convert_u(self.u_composers_res)

        dist_u_genres = self.sparkContext.parallelize(u_genres) \
            .map(lambda u: similar(u, i_genres, events))
        dist_u_genres.saveToCassandra("music_recommendation",
                                      "result_cb_user_item_genre")

        dist_u_artists = self.sparkContext.parallelize(u_artists) \
            .map(lambda u: similar(u, i_artists, events))
        dist_u_artists.saveToCassandra("music_recommendation",
                                       "result_cb_user_item_artist")

        dist_u_composer = self.sparkContext.parallelize(u_composers) \
            .map(lambda u: similar(u, i_composers, events))
        dist_u_composer.saveToCassandra("music_recommendation",
                                        "result_cb_user_item_composer")

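# One plausible shape for `similar` -- a sketch only, since the project's
# real implementation is not shown. It assumes profiles are dicts of
# feature -> weight, and returns a row matching the result tables above.
import math


def similar(user, item_profiles, events, top_n=10):
    uid, u_profile = user
    heard = set(events.get(uid, []))
    scores = []
    for sid, i_profile in item_profiles.items():
        if sid in heard:
            continue  # don't recommend songs the user already knows
        # cosine similarity between the two sparse feature dicts
        dot = sum(u_profile.get(f, 0.0) * w for f, w in i_profile.items())
        norm = (math.sqrt(sum(v * v for v in u_profile.values())) *
                math.sqrt(sum(v * v for v in i_profile.values())))
        scores.append((dot / norm if norm else 0.0, sid))
    scores.sort(reverse=True)
    return {"uid": uid, "recommendations": [sid for _, sid in scores[:top_n]]}
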
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")
sc = CassandraSparkContext("spark://192.168.15.87:7077", "Simple App",
                           conf=conf)

# Group rows by whether their date falls inside the window; collect() yields
# (False, rows-outside) and (True, rows-inside) pairs
rdd = sc.cassandraTable("testkeyspace", "stock") \
    .select("ric", "date", "time", "high", "low") \
    .groupBy(lambda r: r["date"] > 20050613 and r["date"] < 20170511) \
    .collect()

for gr in rdd:
    if gr[0]:  # only the rows inside the date window
        new_rdd = sc.parallelize(list(gr[1]))
        for time in ["9:30:00 AM", "10:30:00 AM", "11:30:00 AM",
                     "12:30:00 PM", "1:30:00 PM", "2:30:00 PM"]:
            rdd_temp = new_rdd.groupBy(lambda r: r["time"] == time)
            for r in rdd_temp.collect():
                if r[0]:
                    for i in r[1]:
                        print(i)  # each batch
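# A boolean groupBy used only to pick one group is commonly written as a
# plain filter -- a sketch of the equivalent, keeping only rows inside the
# date window (same table and columns as above):
in_window = sc.cassandraTable("testkeyspace", "stock") \
    .select("ric", "date", "time", "high", "low") \
    .filter(lambda r: 20050613 < r["date"] < 20170511)
for time in ["9:30:00 AM", "10:30:00 AM"]:
    for row in in_window.filter(lambda r, t=time: r["time"] == t).collect():
        print(row)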