def feeder(start_date, end_date, e1, e2, q):
    conf = SparkConf().setAppName("Simple App").setMaster(
        "spark://127.0.0.1:7077").set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    a = ""
    l = ['"SP1"', '"SP2"']
    asia = pytz.timezone("Asia/Kolkata")

    # creating a dataframe for the date range and ric names
    rdd = sc.cassandraTable("testkeyspace", "stock_test").select(
        "ric", "time_stamp", "high", "low").where(
        "ric in ?", ["SP1", "SP2", "SP3"]).where(
        "time_stamp > ? and time_stamp < ?",
        datetime(2010, 11, 26, 12, 30, tzinfo=asia),
        datetime(2010, 12, 10, 12, 30, tzinfo=asia)).toDF()

    # making a batch according to the time_stamp
    rdd = rdd.orderBy("time_stamp").groupBy("time_stamp").agg(
        collect_list(struct('ric', 'time_stamp', 'high', 'low'))).collect()

    # sending one batch to analytical engine
    for gr in rdd:
        e2.clear()
        send = gr[1]
        q.put(send)  # adding the batch to the queue
        e2.set()
        e1.wait()

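# A hedged sketch (not part of the original) of the consumer side of the handshake
# used by feeder() above: the analytical engine is assumed to wait on e2, take one
# batch from the queue, process it, and then set e1 so feeder() can queue the next
# batch. process_batch is a hypothetical placeholder, and clearing e1 between
# batches is assumed to happen elsewhere.
def analytical_engine(e1, e2, q):
    while True:
        e2.wait()             # feeder signals that a batch has been queued
        batch = q.get()       # one time_stamp group of (ric, time_stamp, high, low) rows
        process_batch(batch)  # hypothetical analytics step
        e1.set()              # let feeder() move on to the next batch
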
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors. Notice that the data returned by Cassandra is
    # dict-like: you can access partition and clustering keys as well as
    # columns by name. CQL collections (lists, sets and maps) are converted
    # to proper Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
        .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)

def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors. Notice that the data returned by Cassandra is
    # dict-like: you can access partition and clustering keys as well as
    # columns by name. CQL collections (lists, sets and maps) are converted
    # to proper Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
        .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)

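# A hedged usage sketch (not in the original): how run_driver might be invoked
# from a __main__ block. The argument handling and the default Cassandra host
# are assumptions, not the author's code.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="PySpark Cassandra sample driver")
    parser.add_argument("keyspace", help="keyspace holding the pixel table")
    parser.add_argument("table", help="table to read from and write to")
    parser.add_argument("--cass-host", default="127.0.0.1",
                        help="Cassandra contact point (assumed default)")
    args = parser.parse_args()

    run_driver(args.keyspace, args.table, args.cass_host)
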
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-device.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-device.py ", file=sys.stderr)
        exit(-1)

    conf = SparkConf() \
        .setAppName("spark-calculate-device") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "config_device", "fsa")
        if not rdd.isEmpty():
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['config_device']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_device': row['config_device'],
                    'device_count': row['count'],
                    'bucket': 3
                }
                array.append(x)
            result = sc.parallelize(array)
            result.saveToCassandra('web_analytic', 'device_report')
        # break

col_tenant_id = 1
col_user_id = 2
col_item_id = 3
num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set(
    "spark.cassandra.connection.host", spark_cassandra_connection_host)
print('conf')
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')

if LOAD_DATA_FROM_DB:
    data = sc.cassandraTable(cassandra_keyspace, cassandra_table,
                             row_format=1).collect()  # row_format: tuple
    # (id, tenant_id, user_id, item_id)

tenant_ids = set(list(map(lambda x: x[col_tenant_id], data)))
data_rdd = sc.parallelize(data)
# data_rdd = sc.parallelize(data).map(list)

all_results_per_user = sc.emptyRDD()
all_results_per_item = sc.emptyRDD()

for t_id in tenant_ids:
    print("\nComputing recommendation for tenant {}...\n".format(t_id))
    per_tenant_rdd = data_rdd.filter(
        lambda x: x[col_tenant_id] == t_id).map(
        lambda l: ((l[col_user_id], l[col_item_id]), 1.0)).reduceByKey(
        lambda x, y: x + y).map(
        lambda x: (x[0][0], x[0][1], x[1]))

if len(sys.argv) != 1:
    print("Usage: spark-calculate-pageview-total-user", file=sys.stderr)
    exit(-1)

conf = SparkConf() \
    .setAppName("spark-calculate-pageview-total-user") \
    .set("spark.cassandra.connection.host", "10.88.113.74")
sc = CassandraSparkContext(conf=conf)
spark = SparkSession(sc)

# i = 1514505600
# while i <= 1514764800:
while True:
    current_date = getGMT()
    future_date = getNextGMT()
    # date_temp = i
    # i = i + 86400
    raw = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
        "m_date", "userid", "fsa", "fsid", "location_path")
    if not raw.isEmpty():
        df = raw.toDF()
        current_day = df.filter(df.m_date >= current_date).filter(
            df.m_date < future_date).dropDuplicates(['fsa', "fsid"]).select('fsa', 'fsid')
        previous_day = df.filter(df.m_date < current_date).select('fsa', 'fsid')
        # current_day = df.filter(df.m_date >= date_temp).filter(df.m_date < i).dropDuplicates(['fsa', "fsid"]).select('fsa', 'fsid')
        # previous_day = df.filter(df.m_date < date_temp).select('fsa', 'fsid')
        result_new_user = current_day.subtract(previous_day)
        total_newuser = result_new_user.count()
        result_newuser = sc.parallelize([{
            "bucket": 1,
            "m_date": int(current_date),
            # "m_date": int(date_temp),
            "newusers": int(total_newuser)
        }])

# X varchar,
# Y varchar,
# Count int,
# PRIMARY KEY(ZipCode, TaxonomyCode1, ProviderNumber)
# );

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    x = sc.cassandraTable("census", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    # BusinessPracticeLocationPostalCode -> ZipCode in providers.taxonomy_count
    y = sc.cassandraTable("providers", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("businesspracticelocationpostalcode", "taxonomycode1")\
        .map(lambda x: (x[0], x[1]))

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # joining x and y
    # (zipcode, pop, meanincome, zipcode, businesspracticelocationpostalcode, taxonomycode1)
    # -> (zipcode, pop, meanincome, taxonomycode1)
    # -> (zipcode, pop, meanincome, taxonomycode1, count)
    def x_y_joinSeparator():

from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from subprocess import call
import subprocess
import commands

aa = commands.getstatusoutput("b=0")

conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://128.138.202.110:7077") \
    .set("spark.cassandra.connection.host", "128.138.202.117")
sc = CassandraSparkContext(conf=conf)

users = sc.cassandraTable("junk", "trump2")
trump = users.map(lambda x: {"tweet_id": x['tweet_id'], "tweet": x['tweet']})

# to access Twitter API
consumer_key = "43b4urzsW8nMY3oGzB5tIIM8B"
consumer_secret = "fbGLMhkFyipYbTAz0s0S6yrN6cDGGWnEMmNaciceYjr4sgEdP2"
garbage = 0
access_token = "2990432317-eYMpYm2Ck2G1YBPvWEq7Mf9wdgzBlOydabaxmzN"
access_token_secret = "lQYcmiMlFdic9KSdmd6PClGQ3Swq8y9BgvVPOmqwhHjV2"

mongo_client.drop_database(db_out)
mongo_client.close()
print 'database cleared'

col_tenant_id = 1
col_user_id = 2
col_item_id = 3
num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set(
    "spark.cassandra.connection.host", spark_cassandra_connection_host)
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')

data = sc.cassandraTable("mykeyspace", "transactions",
                         row_format=1).collect()  # row_format: tuple
# (id, tenant_id, user_id, item_id)
tenant_ids = set(list(map(lambda x: x[col_tenant_id], data)))
data_rdd = sc.parallelize(data)
# data_rdd = sc.parallelize(data).map(list)

all_results_per_user = sc.emptyRDD()
all_results_per_item = sc.emptyRDD()

for t_id in tenant_ids:
    print("\nComputing recommendation for tenant {}...\n".format(t_id))
    per_tenant_rdd = data_rdd.filter(
        lambda x: x[col_tenant_id] == t_id).map(
        lambda l: ((l[col_user_id], l[col_item_id]), 1.0)).reduceByKey(
        lambda x, y: x + y).map(
        lambda x: (x[0][0], x[0][1], x[1]))

sqlContext = SQLContext(sc)

# Make Spark less verbose
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)


def parsePoint(data):
    # return LabeledPoint(data[3], np.append(data[0:3], data[4:]))
    return LabeledPoint(data[0], data[1:])


# store the data from cassandra in a data frame and remove the NA values
data = sc.cassandraTable("msd_01", "songs").select(
    "song_hotttnesss", "loudness", "year", "sentiment", "tempo", "unique_words").toDF()
data = data.filter("year>0").na.drop()
print data.count()

# Scale the features with StandardScaler
data2 = data.map(lambda x: [
    x.song_hotttnesss, x.loudness, x.year, x.sentiment, x.tempo, x.unique_words
])  # convert each sql.Row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2)  # fit a scaler on every column
scaledData = scaler.transform(data2)  # transform our data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)

# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for (d in range 2015-10-01 ~ 2015-10-10) do:
#
#   SELECT url, date, site, cnts, cnt FROM cockpit2_allTogether
#   WHERE `date` = d AND site = giga AND tags CONTAINS resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))

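# A hedged sketch (not part of the original script) of the cnts re-computation that
# the comment above describes: take each row, sum the listed counters into cnts, and
# write the rows back. The helper name recompute_cnts, the dict copy, and the
# commented write-back target are assumptions, not the author's code.
def recompute_cnts(row):
    total = (row['ga_videoPlays'] + row['sda_downloads']
             + row['fb_socialFacebookLikes'] + row['fb_socialFacebookShares']
             + row['fb_socialFacebookComments'] + row['tw_socialTwitterShares']
             + row['ga_socialGooglePlusShares'] + row['gigya_socialComments'])
    updated = dict(row)      # copy the dict-like Cassandra row
    updated['cnts'] = total
    return updated

# updated_rdd = rdd.filter(filterDateRage("2015-01-10 00:00", "2015-10-10 00:00", "date")) \
#                  .map(recompute_cnts)
# updated_rdd.saveToCassandra("el_test", "cockpit2_testTogether")
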
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf().setAppName("NBARetrieval") \
    .set("spark.cassandra.connection.timeout_ms", "20000") \
    .set("spark.cassandra.connection.host", "192.168.0.10") \
    .set("spark.cassandra.auth.username", "mdi") \
    .set("spark.cassandra.auth.password", "W2yIJw6ntl5RYC54VChe3lJoXa")
sc = CassandraSparkContext(conf=conf)

rdd = sc.cassandraTable("test", "kv")
print rdd.first()

mongo_client = MongoClient()
mongo_client.drop_database(db_out)
# print 'database cleared'

num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

conf = SparkConf().setAppName("PysparkCollaborativeFiltering")
print 'conf'
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')

if LOAD_DATA_FROM_DB:
    data_rdd = sc.cassandraTable(cassandra_keyspace, cassandra_table)  # row_format: Row
    # print data

t0 = time.time()
tenant_ids = data_rdd.map(lambda trans: trans[col_tenant_id]).distinct().collect()
elapsed = (time.time() - t0)
print("\nIt took %.2fsec to complete" % elapsed)

t0 = time.time()
cluster = Cluster()
session = cluster.connect(cassandra_keyspace)
string = 'SELECT DISTINCT ' + col_tenant_id + ' from ' + cassandra_table
tenant_ids = session.execute(string)
elapsed = (time.time() - t0)
print("\nIt took %.2fsec to complete" % elapsed)

conf = SparkConf().setAppName("Regression on Song Hotness Analysis").setMaster("spark://muziki:7077") sc= CassandraSparkContext(conf=conf) sqlContext = SQLContext(sc) # Make Spark less verbose logger = sc._jvm.org.apache.log4j logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR ) logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR ) def parsePoint(data): #return LabeledPoint(data[3],np.append(data[0:3],data[4:])) return LabeledPoint(data[0],data[1:]) # store the data from cassandra to a data frame and remove the NA value data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF() data=data.filter("year>0").na.drop() print data.count() # Scale the features with Standard Scaler data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words])#Convert each sql.row to an array scaler= StandardScaler(withMean=True, withStd=True).fit(data2) #fit a scaler on the every column scaledData = scaler.transform(data2)# transform our data # Transform to a labelled vector parsedData = scaledData.map(parsePoint) # # Build the model model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True)
""" spark-submit --packages anguenot:pyspark-cassandra:0.7.0 recommendation_engine/backup_user_event.py """ if __name__ == '__main__': if len(sys.argv) != 1: print("Usage: last_like.py ", file=sys.stderr) exit(-1) conf = SparkConf() \ .setAppName("last-like") \ .set("spark.cassandra.connection.host", "10.88.113.74") sc = CassandraSparkContext(conf=conf) spark = SparkSession(sc) sql = SQLContext(sc) while True: current_date = getGMT() future_date = getNextGMT() rdd = sc.cassandraTable("db","user_event_model").select("idx_user","idx_movie","rating","time","type_event")\ .filter(lambda x: current_date <= int(x['time']) < future_date) if rdd.isEmpty() == False: rdd.toDF().write\ .format('com.databricks.spark.csv')\ .option("header", "true")\ .save('/home/trantu/Desktop/engine_recommendation.git/trunk/meta-data/user_event/' \ +datetime.now().strftime('%Y_%m_%d_%H_%M_%S.csv')) # rdd.toDF().write.csv('/home/tutn6/Desktop/engine_recommendation.git/trunk/mycsv') # rdd.deleteFromCassandra("db","user_event_model") break
if len(sys.argv) != 1:
    print("Usage: spark-calculate-pageview", file=sys.stderr)
    exit(-1)

conf = SparkConf() \
    .setAppName("spark-calculate-pageview") \
    .set("spark.cassandra.connection.host", "10.88.113.74")
sc = CassandraSparkContext(conf=conf)
spark = SparkSession(sc)

i = 1511740800
while i <= 1514505600:
    # while True:
    date_temp = i
    i = i + 86400
    # current_date = getGMT()
    # future_date = getNextGMT()
    rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select("m_date", "location_path")\
        .filter(lambda x: date_temp <= int(x['m_date']) < i)
    if not rdd.isEmpty():
        x = rdd.toDF().groupBy(['location_path']).count()
        # x.show()
        array = []
        for row in x.collect():
            x = {
                'location_path': row['location_path'],
                'm_date': date_temp,
                'count': row['count'],
                'bucket': 5
            }
            array.append(x)
        result = sc.parallelize(array)
        result.saveToCassandra('web_analytic', 'page_view_report')
        break
    pass

spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-language.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-language.py ", file=sys.stderr)
        exit(-1)

    conf = SparkConf() \
        .setAppName("spark-calculate-language") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_browser_lan", "fsa")
        if not rdd.isEmpty():
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_browser_lan']).count()
            array = []
            for row in x.collect():
                x = {
                    'browser_language': row['location_browser_lan'],
                    'count': row['count'],
                    'bucket': 6
                }
                array.append(x)
            result = sc.parallelize(array)
            result.saveToCassandra('web_analytic', 'browser_language_report')
        # break

from pyspark_cassandra import CassandraSparkContext
from operator import add

start_time = time.time()

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    x = sc.cassandraTable("census", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    y = sc.cassandraTable("hospitals", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "numberofdoctors")\
        .map(lambda x: (x[0], x[1]))\
        .reduceByKey(lambda x, y: x + y)

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # (zipcode, pop, meanincome, zipcode, numberofdoctors)
    # -> (zipcode, pop, meanincome, numberofdoctors, pop/numberofdoctors)
    def joinSeparator():
        return lambda x: (x[0], x[1], x[2], x[4], float(x[1]) / x[4])

    cond = [df_x._1 == df_y._1]
    popDoctorRatioQueryRDD = df_x.join(df_y, cond) \
        .map(joinSeparator()) \

from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")

sc = CassandraSparkContext("spark://192.168.15.87:7077", "Simple App", conf=conf)

rdd = sc.cassandraTable("testkeyspace", "stock2").select(
    "ric", "time_stamp", "high", "low").spanBy('time_stamp').collect()

for gr in rdd:
    print(gr)  # one batch
    print("+++++++++++++++")

from operator import add

start_time = time.time()

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster("spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    # num of hospitals RDD (zipcode, numofhospitals)
    x = sc.cassandraTable("hospitals", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode")\
        .map(lambda x: (x, 1))\
        .reduceByKey(add)\
        .map(lambda x: (x[0][0], x[1]))

    y = sc.cassandraTable("hospitals", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "providernumber", "hospitalname", "x", "y", "numberofdoctors")\
        .map(lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))

    z = sc.cassandraTable("census", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)
    df_z = sqlContext.createDataFrame(z)

    # (zipcode, numofhospitals, zipcode, providernumber, hospitalname, x, y, numberofdoctors)
    # -> (zipcode, numofhospitals, providernumber, hospitalname, x, y, numberofdoctors)
    def x_y_joinSeparator():
        return lambda x: (x[0], x[1], x[3], x[4], x[5], x[6], x[7])

# Y varchar,
# Count int,
# MeanIncome int,
# PRIMARY KEY(ZipCode, TaxonomyCode1, ProviderNumber)
# );

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    x = sc.cassandraTable("census", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    y = sc.cassandraTable("hospitals", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "providernumber", "hospitalname", "x", "y")\
        .map(lambda x: (x[0], x[1], x[2], x[3], x[4]))

    # BusinessPracticeLocationPostalCode -> ZipCode in providers.taxonomy_count
    z = sc.cassandraTable("providers", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("businesspracticelocationpostalcode", "taxonomycode1")\
        .map(lambda x: (x[0], x[1]))

    # num of hospitals RDD (zipcode, numberofhospitals)
    a = sc.cassandraTable("hospitals", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode")\
        .map(lambda x: (x, 1))\
        .reduceByKey(add)\
        .map(lambda x: (x[0][0], x[1]))

    df_x = sqlContext.createDataFrame(x)

if __name__ == "__main__": conf = SparkConf().setAppName("UserUserRelevance").setMaster(config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP) sc = CassandraSparkContext(conf=conf) filename = datetime.now().strftime("%Y-%m-%d")+"-usersonglog.txt" users = sc.textFile(config.HDFS_URL+":"+config.HDFS_PORT+config.LOG_FOLDER+filename) \ .filter(time_range_filter) \ .map(parse_log_entry) \ .keys() \ .collect() song_map = {} # store song to user mapping for use in later stages usersongdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "user_to_song") songuserdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "song_to_user") for user in users: user_suggest = [] song_list = usersongdb.select("song_id") \ .where("user_id=? and req_time > ? and req_time < ?", int(user), five_weeks_back, now+1) \ .map(lambda row: row.song_id) \ .distinct() \ .collect() songs = list(set(song_list)) for song in songs: if song in song_map: listeners = song_map[song] else:
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-total-user.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-process-data", file=sys.stderr)
        exit(-1)

    conf = SparkConf() \
        .setAppName("spark-calculate-total-user") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)

    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select("m_date", "userid", "fsa", "fsid")\
            .filter(lambda x: current_date <= int(x['m_date']) < future_date)
        if not rdd.isEmpty():
            table = rdd.toDF()
            # table.show(truncate=False)
            total = table.dropDuplicates(['fsa', "fsid"]).count()
            result = sc.parallelize([{
                "bucket": 0,
                "m_date": int(current_date),
                "users": int(total)
            }])
        else:
            result = sc.parallelize([{
                "bucket": 0,
                "m_date": int(current_date),

    result = ''
    for i in range(3000):
        if vecFirst[i] > 0:
            result = '%s;%d %.4f' % (result, i, vecFirst[i])
    return result


conf = SparkConf()\
    .set("spark.cassandra.connection.host", "localhost")
sc = CassandraSparkContext(conf=conf)

vecSum = sc.cassandraTable('reishi', 'dockmeans')\
    .select("cluster_id", "vector")\
    .where("cluster_id=?", 0)\
    .map(lambda x: (x['cluster_id'], x['vector']))\
    .reduceByKey(lambda x, y: maxVector(x, y))\
    .collect()

vector = []
# print(vecSum)
v = vecSum[0]
v = v[1]
# print('=================================' + v)
splt = v.split(';')
for t in splt:
    p = t.split(' ')
    if len(p) > 1:
        tp = tpVector(int(p[0]), float(p[1]))
        vector.append(tp)

sc = CassandraSparkContext(conf=conf)


def retTuple(r):
    age = int(r["age"])
    if age < 20:
        return ("<20", 1)
    if age < 40:
        return ("20 < 40", 1)
    if age < 60:
        return ("40 < 60", 1)
    return (">60", 1)


result = sc.cassandraTable("zeus", "node") \
    .select("age") \
    .where("type=?", "person") \
    .map(retTuple) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()

print
print "================================"
print "AGE DEMOGRAPHICS"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print

spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-location.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-location.py ", file=sys.stderr)
        exit(-1)

    conf = SparkConf() \
        .setAppName("spark-calculate-location") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_country_name", "location_country_code", "fsa")
        # rdd.toDF().show()
        if not rdd.isEmpty():
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_country_name', 'location_country_code']).count()
            array = []
            for row in x.collect():
                x = {
                    'location_country_name': row['location_country_name'],
                    'location_country_code': row['location_country_code'],
                    'location_count': row['count'],
                    'bucket': 2
                }
                array.append(x)

from pyspark_cassandra import CassandraSparkContext
from operator import add

start_time = time.time()

if __name__ == "__main__":
    conf = SparkConf().setAppName("Tool App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    taxonomyRDD = sc.cassandraTable(
        "providers", "test",
        row_format=pyspark_cassandra.RowFormat.TUPLE).select("taxonomycode1")

    customSchema = StructType(
        [StructField("TaxonomyCode1", StringType(), True)])
    taxonomyDF = sqlContext.createDataFrame(taxonomyRDD, customSchema).distinct()

    with open("/home/cs179g/logs/taxonomyList", "w") as outfile:
        for row in taxonomyDF.rdd.collect():
            outfile.write(row[0] + '\n')

    sc.stop()

from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("ZeusDB") \
    .setMaster("local") \
    .set("spark.cassandra.connection.host", "YOUR_CLUSTER_HOST_NAME")
sc = CassandraSparkContext(conf=conf)

result = sc.cassandraTable("zeus", "edge") \
    .select("destination", "type") \
    .filter(lambda x: x["type"] == "friend") \
    .map(lambda x: (x["destination"], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .top(10, key=lambda x: x[1])

print
print "================================"
print "TOP 10 PEOPLE WITH MOST FRIENDS"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print

# i = 1514505600
# while i <= 1515110400:
while True:
    # date_temp = i
    # i = i + 86400
    current_date = getGMT()
    future_date = getNextGMT()
    rdd = sc.cassandraTable("web_analytic", "fsa_log_visit")\
        .select(
            "config_browser",
            "config_device",
            "fsa",
            "location_browser_lan",
            "location_country_name",
            "location_country_code",
            "m_date",
            "location_path",
            "location_city_name",
            "config_resolution",
            "location_os"
        )\
        .filter(lambda x: current_date <= int(x['m_date']) < future_date)
    # .filter(lambda x: date_temp <= int(x['m_date']) < i)  # 1514332800
    if not rdd.isEmpty():
        table_drop = rdd.toDF().dropDuplicates(['fsa'])
        # table_drop.show()
        # break
        result_config_browser = table_drop.groupBy(['config_browser'

# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")

# for (d in range 2015-10-01 ~ 2015-10-10) do:
#
#   SELECT url, date, site, cnts, cnt FROM cockpit2_allTogether
#   WHERE `date` = d AND site = giga AND tags CONTAINS resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))

    def inner(row):

startDate = str(sys.argv[1])
endDate = str(sys.argv[2])

conf = (
    SparkConf()
    .setAppName("User Food Migration")
    .setMaster("spark://128.138.202.110:7077")
    .set("spark.cassandra.connection.host", "128.138.202.117")
)
sc = CassandraSparkContext(conf=conf)

if __name__ == "__main__":
    rdd = sc.cassandraTable("junk", "bernie4")
    temp = 0

    # returns list of tweets
    listBernie = (
        rdd.filter(lambda row: row.created_at[4:] > startDate)
        .filter(lambda row: row.created_at[4:] < endDate)
        .collect()
    )

    for tweet in listBernie:
        if tweet.retweet_count > 0:
            print tweet.retweet_count
            temp += 1
        if tweet.favorite_count > 0:
            print tweet.favorite_count
            temp += 1
        if tweet.coordinates is not None:

from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext  # needed for toDF()

conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://127.0.0.1:7077") \
    .set("spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)

users = sc.cassandraTable("demo", "user").toDF()
food_count = users.select("favorite_food").groupBy("favorite_food").count()

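# A hedged follow-up sketch (not in the original): two ways the aggregated counts
# could be used. Printing works on the DataFrame as built above; the commented
# write-back assumes a hypothetical "demo.food_counts" table.
for row in food_count.collect():
    print row["favorite_food"], row["count"]

# food_count.rdd \
#     .map(lambda r: {"favorite_food": r["favorite_food"], "count": r["count"]}) \
#     .saveToCassandra("demo", "food_counts")
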
KEYSPACE = "undefined" # mettez "share" si vous n'avez pas fait l'injection des données # vérification que le keyspace a bien été défini correctement if KEYSPACE == "undefined": raise Exception("Vous n'avez pas changé le keyspace, éditez le début du script") # lancement par spark-submit --py-files /usr/lib/spark/jars/pyspark-cassandra-0.7.0.jar livres.py # nom de l'application appName = "TP5 partie Spark/Cassandra" # bibliothèques pour travailler avec Cassandra from pyspark import SparkConf from pyspark_cassandra import CassandraSparkContext # contexte d'exécution pour spark-submit conf = SparkConf() \ .setAppName(appName) \ .setMaster("spark://master:7077") \ .set("spark.cassandra.connection.host", "master") csc = CassandraSparkContext(conf=conf) # ouvrir une table Cassandra à l'aide de csc livres = csc.cassandraTable(KEYSPACE, "livres"); # nombre de livres de Jules Verne print livres.filter(lambda livre: livre.auteur=="Jules Verne").count()
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("ZeusDB") \
    .setMaster("local") \
    .set("spark.cassandra.connection.host", "YOUR_CLUSTER_HOST_NAME")
sc = CassandraSparkContext(conf=conf)

result = sc.cassandraTable("zeus", "edge") \
    .select("destination", "visit_count", "type") \
    .filter(lambda x: x["type"] == "visited") \
    .map(lambda x: (x["destination"], int(x["visit_count"]))) \
    .reduceByKey(lambda a, b: a + b) \
    .top(10, key=lambda x: x[1])

print
print "================================"
print "TOP 10 FREQUENTLY VISITED PLACES"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print

from operator import add

start_time = time.time()

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    hospitalNameList = sc.cassandraTable("hospitals", "test",
                                         row_format=pyspark_cassandra.RowFormat.TUPLE).select("hospitalname", "numberofdoctors")\
        .map(lambda x: (x[0], x[1]))\
        .reduceByKey(lambda x, y: x + y)

    # for row in popDoctorRatioRDD:
    #     print(row)

    dfList = hospitalNameList.collect()

    # if you do a left outer join, the results that are null for the census show up
    # because hospitals have them
    customSchema = StructType([
        StructField("HospitalName", StringType(), True),
        StructField("NumberOfDoctors", IntegerType(), True)
    ])
    hospitalNameDF = sqlContext.createDataFrame(dfList, customSchema)
    dfJSONRDD = hospitalNameDF.toJSON().collect()

from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("PySpark Cassandra Test").set(
    "spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(conf=conf)

data = sc.cassandraTable("mykeyspace", "user", row_format=1).collect()
rdd = sc.parallelize(data)
print(rdd.collect())

ETNIA = {
    1: 'BRANCA',
    2: 'PRETA',
    3: 'PARDA',
    4: 'AMARELA',
    5: 'INDÍGENA',
}

# Create the SparkContext
conf = SparkConf() \
    .setAppName("Pergunta1") \
    .set("spark.cassandra.connection.host", "10.7.40.94")
csc = CassandraSparkContext(conf=conf)

# Prepare the RDDs for the tables.
candidatos = csc.cassandraTable("eleicoes", "candidatos2014")
resultados = csc.cassandraTable("eleicoes", "resultados2014")

# Fetch the codes of the distinct elected candidates (so the runoff round is disregarded).
# There are 3 types of elected candidates (1, 2, 3).
cod_eleitos1 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 1)
cod_eleitos2 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 2)
cod_eleitos3 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 3)

# Union of the codes of the elected candidates (the 3 types).
cod_eleitos = cod_eleitos1.union(cod_eleitos2).union(cod_eleitos3).map(
    lambda row: (row['sq_candidato'], 0))

# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for (d in range 2015-10-01 ~ 2015-10-10) do:
#
#   SELECT url, date, site, cnts, cnt FROM cockpit2_allTogether
#   WHERE `date` = d AND site = giga AND tags CONTAINS resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))

    def inner(row):

""" spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-last-like.py """ if __name__ == '__main__': if len(sys.argv) != 1: print("Usage: spark-calculate-last-like.py ", file=sys.stderr) exit(-1) conf = SparkConf() \ .setAppName("spark-calculate-last-like") \ .set("spark.cassandra.connection.host", "10.88.113.74") sc = CassandraSparkContext(conf=conf) spark = SparkSession(sc) sql = SQLContext(sc) while True: rdd = sc.cassandraTable("db","user_event_model").select("config_browser","m_date") if rdd.isEmpty() == False: x = rdd.toDF().groupBy(['config_browser']).count() array = [] for row in x.collect(): x = { 'config_browser': row['config_browser'], 'browser_count': row['count'], 'bucket':4 } array.append(x) result = sc.parallelize(array) result.saveToCassandra('test','browser_report')
from pyspark import SparkConf, SparkContext
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf()\
    .setAppName("PySpark Cassandra Test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host",
         "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
# .set("spark.cassandra.connection.host", "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
sc = CassandraSparkContext(conf=conf)

print(sc.cassandraTable(
    "tweetdb", "tweettable").select("tweet").map(lambda a: a).collect())
# sc.pprint()

# rdd = sc.parallelize([{"tweet": "first second third tweet"}])
# rdd.saveToCassandra(
#     "tweetdb",
#     "tweettable")

from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")

sc = CassandraSparkContext("spark://192.168.15.87:7077", "Simple App", conf=conf)

rdd = sc.cassandraTable("testkeyspace", "stock").select(
    "ric", "date", "time", "high", "low").groupBy(
    lambda r: r["date"] > 20050613 and r["date"] < 20170511).collect()

for gr in rdd:
    if gr[0]:
        new_rdd = sc.parallelize(list(gr[1]))
        for time in [
            "9:30:00 AM", "10:30:00 AM", "11:30:00 AM",
            "12:30:00 PM", "1:30:00 PM", "2:30:00 PM"
        ]:
            rdd_temp = new_rdd.groupBy(lambda r: r["time"] == time)
            for r in rdd_temp.collect():
                if r[0]:
                    for i in r[1]:
                        print(i)  # each batch
