def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")

    conf = SparkConf() \
        .setMaster("local[2]") \
        .setAppName("TweeStreamer") \
        .set("spark.cassandra.connection.host",
             "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sql = SQLContext(sc)

    # Create a streaming context with a batch interval of 10 seconds
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: json.loads(x[1]))

    # Score each tweet against the positive/negative word lists and persist
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))
    tweetsUsentiment.saveToCassandra("tweetdb", "tweettable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
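# load_wordlist is assumed by the snippet above but not defined in it.
# A minimal sketch, assuming one word per line in the file:
def load_wordlist(path):
    with open(path) as f:
        return set(line.strip() for line in f if line.strip())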
import datetime as dt
from uuid import uuid4


def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the data returned by Cassandra is
    # dict-like: you can access partition and clustering keys as well as
    # regular columns by name, and CQL collections (lists, sets and maps)
    # are converted to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)).distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {"visitor_id": "xyz"}
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))
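# Hypothetical invocation of the driver above; the keyspace, table and host
# values are placeholders, not taken from the original.
if __name__ == '__main__':
    run_driver("analytics", "pixels", "127.0.0.1")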
def feeder(start_date, end_date, e1, e2, q):
    # start_date/end_date are currently unused; the query range is hardcoded
    conf = SparkConf() \
        .setAppName("Simple App") \
        .setMaster("spark://127.0.0.1:7077") \
        .set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)

    # Use localize() rather than tzinfo=; passing a pytz zone as tzinfo
    # yields wrong (pre-standardization) offsets
    asia = pytz.timezone("Asia/Kolkata")

    # Create a DataFrame for the date range and RIC names
    df = sc.cassandraTable("testkeyspace", "stock_test").select(
        "ric", "time_stamp", "high", "low").where(
        "ric in ?", ["SP1", "SP2", "SP3"]).where(
        "time_stamp > ? and time_stamp < ?",
        asia.localize(datetime(2010, 11, 26, 12, 30)),
        asia.localize(datetime(2010, 12, 10, 12, 30))).toDF()

    # Group the rows into batches by time_stamp
    batches = df.orderBy("time_stamp").groupBy("time_stamp").agg(
        collect_list(struct('ric', 'time_stamp', 'high', 'low'))).collect()

    # Send the batches to the analytical engine one at a time
    for gr in batches:
        e2.clear()
        q.put(gr[1])  # add the batch to the queue
        e2.set()      # signal the consumer that a batch is ready
        e1.wait()     # wait until the consumer has processed it
def streaming_logic():
    """
    :function: initialize the Spark context and all the streaming logic
    :return: None
    """
    # - read configuration from file
    spark_config, kafka_config, cassandra_config = read_config()

    # - initialize the Spark context
    conf = SparkConf() \
        .setMaster(spark_config['master']) \
        .setAppName(spark_config['app_name']) \
        .set('spark.cassandra.connection.host', cassandra_config['cluster'])
    csc = CassandraSparkContext(conf=conf)
    csc.setLogLevel(spark_config['log_level'])
    ssc = StreamingContext(sparkContext=csc, batchDuration=spark_config['time_window'])

    # - create the Kafka stream
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [kafka_config['topic_in']],
        {'metadata.broker.list': kafka_config['cluster']})

    # - start to process data
    # - output data structure: MetaData
    structured_stock_data = directKafkaStream.map(lambda data: preprocess_data(data=data))
    structured_stock_data.pprint(20)
    stock_data_list = structured_stock_data.reduceByKey(lambda a, b: aggregate_list(a, b))
    stock_data_list.pprint(20)

    # - get history data from Cassandra
    alert_user_data = stock_data_list.mapValues(
        lambda dictlist: compute_stock_tending_in_window(dict_list=dictlist))
    alert_user_data.pprint(20)

    # - send alerts to users (see the sketch of send_alert_to_kafka below)
    alert_user_data.foreachRDD(lambda rdd: rdd.foreachPartition(
        lambda iter: send_alert_to_kafka(iterator=iter, kafka_config=kafka_config)))

    ssc.start()
    ssc.awaitTermination()
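# send_alert_to_kafka is assumed above but not defined in this snippet.
# A hypothetical sketch: one KafkaProducer per partition, pushing each alert
# to an assumed 'topic_out' key in kafka_config (that key is a guess).
from kafka import KafkaProducer
import json


def send_alert_to_kafka(iterator, kafka_config):
    producer = KafkaProducer(
        bootstrap_servers=kafka_config['cluster'],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    for record in iterator:
        producer.send(kafka_config['topic_out'], record)
    producer.flush()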
def setUpClass(cls):
    super(CassandraTestCase, cls).setUpClass()
    cls.sc = CassandraSparkContext(
        conf=SparkConf().setAppName("PySpark Cassandra Test"))
    cls.session = Cluster().connect()
    cls.session.execute('''
        CREATE KEYSPACE IF NOT EXISTS test_pyspark_cassandra
        WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
    ''')
    cls.session.set_keyspace('test_pyspark_cassandra')
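# A matching tearDownClass is not shown in the excerpt; a minimal sketch that
# releases the resources acquired above, mirroring the cleanup done in the
# __main__ block further down:
@classmethod
def tearDownClass(cls):
    super(CassandraTestCase, cls).tearDownClass()
    cls.sc.stop()
    cls.session.shutdown()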
class SparkCassandra:
    appNameCassandra = "WikiOlapCassandra"
    appNameSQL = "WikiOlapSQL"
    master = "spark://" + socket.gethostname() + ":7077"

    confCassandra = SparkConf() \
        .setAppName(appNameCassandra) \
        .setMaster(master) \
        .set("spark.cassandra.connection.host",
             os.environ['CASSANDRA_PORT_9042_TCP_ADDR'])

    sc = CassandraSparkContext(conf=confCassandra)
    sqlContext = SQLContext(sc)
def __init__(self):
    self.spark_config = SparkConf() \
        .setMaster("local[4]") \
        .setAppName("Popularity") \
        .set("spark.cassandra.connection.host", "127.0.0.1")
    self.sparkContext = CassandraSparkContext(conf=self.spark_config)

    self.cluster = Cluster()
    self.session = self.cluster.connect("music_recommendation")
    self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                         "FROM user_event "
                                         "WHERE action_type='listen';")
    self.session.execute("DROP TABLE IF EXISTS result_popularity;")
    self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                         "sid text PRIMARY KEY,"
                         "rank int);")

    self.current_year = datetime.datetime.now().year
    self.current_month = datetime.datetime.now().month
def __init__(self):
    self.spark_config = SparkConf() \
        .setMaster("local[4]") \
        .setAppName("ContentBased") \
        .set("spark.cassandra.connection.host", "127.0.0.1")
    self.sparkContext = CassandraSparkContext(conf=self.spark_config)

    self.cluster = Cluster()
    self.session = self.cluster.connect("music_recommendation")

    # Load the precomputed item and user profiles
    cql_cmd = "SELECT * FROM %s"
    self.i_artists_res = self.session.execute(cql_cmd % "i_profile_artist")
    self.i_composers_res = self.session.execute(cql_cmd % "i_profile_composer")
    self.i_genres_res = self.session.execute(cql_cmd % "i_profile_genre")
    self.u_artists_res = self.session.execute(cql_cmd % "u_profile_artist")
    self.u_composers_res = self.session.execute(cql_cmd % "u_profile_composer")
    self.u_genres_res = self.session.execute(cql_cmd % "u_profile_genre")

    # Build a uid -> [song_id, ...] map from the raw listening events
    cql_cmd = "SELECT uid, song_id FROM %s"
    events = self.session.execute(cql_cmd % "user_event")
    self.events = dict()
    for event in events:
        self.events.setdefault(event.uid, []).append(event.song_id)

    # Result tables, one per profile dimension
    for table in ("result_cb_user_item_genre",
                  "result_cb_user_item_artist",
                  "result_cb_user_item_composer"):
        self.session.execute("CREATE TABLE IF NOT EXISTS %s ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);" % table)
def __init__(self):
    self.sparkConfig = SparkConf() \
        .setMaster("local[4]") \
        .setAppName("MCF") \
        .set("spark.cassandra.connection.host", "127.0.0.1") \
        .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
    self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)

    # Model hyperparameters
    self.rank = 10
    self.numIteration = 10
    self.numberOfPreds = 10

    self.cluster = Cluster()
    self.session = self.cluster.connect("music_recommendation")
    self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                        "FROM user_event "
                                        "WHERE action_type='rate'")
    self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                         "uid text PRIMARY KEY,"
                         "recommendations list<text>);")
def main():
    pwords = load_wordlist("../Dataset/positive.txt")
    nwords = load_wordlist("../Dataset/negative.txt")
    sterms = load_wordlist("../Dataset/keyWords.txt")

    conf = SparkConf() \
        .setMaster("local[2]") \
        .setAppName("TweeStreamer") \
        .set("spark.cassandra.connection.host",
             "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Create a streaming context with a batch interval of 10 seconds
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms))

    # Aggregate sentiment per search term (see the sketch of
    # searchTermFunction below) ...
    searchTermUsentiment = tweetsUsentiment.flatMap(
        lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey(
        lambda a, b: a + b)
    # ... and shape each (term, sentiment) pair into a Cassandra row.
    # Tuple parameters in lambdas are Python 2 only, so unpack explicitly.
    searchTermUsentiment = searchTermUsentiment.map(
        lambda kv: {
            "searchterm": "_" + kv[0],
            "insertion_time": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "sentiment": kv[1]
        })
    searchTermUsentiment.pprint()
    searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000)
    ssc.stop(stopGraceFully=True)
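# searchTermFunction is assumed above but not defined here. A hypothetical
# sketch: from its use in flatMap/reduceByKey it must yield
# (search_term, sentiment_score) pairs; the "tweet" and "sentiment" field
# names below are guesses, not taken from the original.
def searchTermFunction(tweet, sterms):
    text = tweet.get("tweet", "").lower()
    return [(term, tweet.get("sentiment", 0))
            for term in sterms if term in text]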
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12 \
  /home/ubuntu/pipeline/kafka_spark_cass_imageQuery.py localhost:2181 imgSearchRequests

# Opening a PySpark shell with Cassandra
$SPARK_HOME/bin/pyspark \
  --master spark://ip-172-31-0-173:7077 \
  --packages TargetHolding/pyspark-cassandra:0.3.5 \
  --conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12
"""

db_table = 0  # global placeholder for the Cassandra-backed RDD
producer = KafkaProducer(
    bootstrap_servers='ec2-52-41-224-1.us-west-2.compute.amazonaws.com:9092',
    value_serializer=lambda v: json.dumps(v).encode('ascii'))

# Kafka and Spark Streaming specific vars. A batch interval of 5s performed
# noticeably better here than 3s, which left the stream waiting a long time.
batch_interval = 5
# See http://www.slideshare.net/JonHaddad/intro-to-py-spark-and-cassandra
sc = CassandraSparkContext(appName="PythonStreamingVSS")
ssc = StreamingContext(sc, batch_interval)
keyspace = "vss_large"

"""
Example usage:
    db_table.select("hashvalue", "partitionby", "videoname").map(lambda x: x['hashvalue']).take(3)
returns
    [u'6daab6a32cb6b209', u'77a888d7aa2f882b', u'571d23371cc358d5']
"""


def main():
    global db_table
    global producer
    if len(sys.argv) != 3:
        # print(..., file=sys.stderr) fails on Python 2 without
        # `from __future__ import print_function` (the error the original
        # comment mentions); writing to the stream directly avoids it.
        sys.stderr.write("Usage: thisfile.py <zk> <topic>\n")
        sys.exit(-1)
            'test_93').on('name').collect())
        self.assertEqual(len(joined), 2)


if __name__ == '__main__':
    try:
        # connect to Cassandra and create a keyspace for testing
        CassandraTestCase.session = Cluster().connect()
        CassandraTestCase.session.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s
            WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
        ''' % (CassandraTestCase.keyspace, ))
        CassandraTestCase.session.set_keyspace(CassandraTestCase.keyspace)

        # create a Cassandra-aware Spark context
        CassandraTestCase.sc = CassandraSparkContext(
            conf=SparkConf().setAppName("PySpark Cassandra Test"))

        # perform the unit tests
        unittest.main()
        # suite = unittest.TestLoader().loadTestsFromTestCase(RegressionTest)
        # unittest.TextTestRunner().run(suite)
    finally:
        # stop the spark context and cassandra session
        if hasattr(CassandraTestCase, 'sc'):
            CassandraTestCase.sc.stop()
        if hasattr(CassandraTestCase, 'session'):
            CassandraTestCase.session.shutdown()
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")


def process(rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    df = df.withColumn('date', transfer_time(tweetsDataFrame.time))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql(
        "SELECT MAX(date) AS date, hashtag, COUNT(*) AS count "
        "FROM historicaltweets "
        "WHERE hashtag IS NOT NULL "
        "GROUP BY hashtag ORDER BY count DESC")
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter", "tweet")
    df.show()


if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 600)

    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": '*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
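# The UDFs `func` and `transfer_time` used in process() are not part of this
# excerpt (in the real file they would live above process()). Hypothetical
# sketches, assuming `func` extracts the first hashtag from the tweet text
# and `transfer_time` formats the raw timestamp:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def _first_hashtag(text):
    for token in (text or "").split():
        if token.startswith("#"):
            return token.lower()
    return None


func = udf(_first_hashtag, StringType())
transfer_time = udf(lambda t: str(t)[:19], StringType())  # crude placeholder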
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")
sc = CassandraSparkContext("spark://192.168.15.87:7077", "Simple App", conf=conf)

rdd = sc.cassandraTable("testkeyspace", "stock2") \
    .select("ric", "time_stamp", "high", "low") \
    .spanBy('time_stamp') \
    .collect()

for gr in rdd:
    print(gr)  # one batch
    print("+++++++++++++++")
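# Design note: spanBy groups consecutive rows by key as they stream from
# Cassandra in the table's natural (clustering) order, so unlike groupBy it
# avoids a shuffle; it only yields complete groups when rows with equal keys
# are stored adjacently.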
# Connect to Cassandra
cluster = Cluster(["127.0.0.1"])
session = cluster.connect()

# Initialize keyspace in Cassandra
session.execute("CREATE KEYSPACE IF NOT EXISTS meetups_space WITH REPLICATION = "
                "{ 'class' : 'SimpleStrategy', 'replication_factor' : 1 };")

# Initialize tables in Cassandra for country, response and event statistics
session.execute("CREATE TABLE IF NOT EXISTS meetups_space.country_statistics "
                "(event_country text PRIMARY KEY, total_rsvps int);")
session.execute("CREATE TABLE IF NOT EXISTS meetups_space.response_statistics "
                "(response text PRIMARY KEY, total_rsvps int);")
session.execute("CREATE TABLE IF NOT EXISTS meetups_space.event_statistics "
                "(event_name text, group_name text, event_country text, response text, "
                "total_rsvps int, PRIMARY KEY ((event_name), response));")

# Configure the Spark stream
conf = SparkConf().set("spark.cassandra.connection.host", "localhost")
sc = CassandraSparkContext(appName='MeetupDashboard', conf=conf)
ssc = StreamingContext(sc, 5)
ssc.checkpoint("/tmp")

topic = ["meetuptopic"]
kafkaConf = {"metadata.broker.list": "localhost:9092",
             "zookeeper.connect": "localhost:2181",
             "group.id": "kafka-spark-streaming",
             "zookeeper.connection.timeout.ms": "1000"}

# DStream
messages = KafkaUtils.createDirectStream(ssc, topic, kafkaConf)

# Pre-process the DStream. Tuple parameters in lambdas are Python 2 only,
# so take the (key, value) pair as a single argument.
lines = messages.map(lambda kv: json.loads(kv[1]))

# Compute country statistics
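# The snippet is cut off after the comment above. A minimal sketch of the
# country-statistics step, assuming the standard Meetup RSVP JSON layout
# (rsvp["group"]["group_country"]) and the country_statistics table created
# earlier:
country_counts = lines \
    .map(lambda rsvp: (rsvp["group"]["group_country"], 1)) \
    .reduceByKey(lambda a, b: a + b)
country_counts \
    .map(lambda kv: {"event_country": kv[0], "total_rsvps": kv[1]}) \
    .saveToCassandra("meetups_space", "country_statistics")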
#!/usr/bin/python3
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.sql import SQLContext
from pyspark_cassandra import CassandraSparkContext, Row
import pandas

# Configuration
sc = CassandraSparkContext()
sqlContext = SQLContext(sc)


# Loads a table from a given keyspace and returns a DataFrame
# (which can later be converted to an RDD)
def load_and_get_table_df(keys_space_name, table_name):
    table_df = sqlContext.read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=table_name, keyspace=keys_space_name) \
        .load()
    return table_df


# Converts data from an RDD to a DataFrame
def rdd_to_df(data):
    new_data = data.map(lambda row: (row, ))
    return sqlContext.createDataFrame(new_data)


# Processing steps for the TF-IDF computation. The original is cut off after
# the first line; the remainder follows the standard MLlib TF-IDF recipe.
def tfidf(data):
    hashing = HashingTF()
    tf = hashing.transform(data)  # term frequencies per document
    tf.cache()
    idf = IDF().fit(tf)           # inverse document frequencies
    return idf.transform(tf)
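# Hypothetical usage with placeholder keyspace/table/column names: load a
# Cassandra table, tokenize one text column, and run the pipeline above.
docs = load_and_get_table_df("demo_keyspace", "demo_table") \
    .rdd.map(lambda row: row["body"].split(" "))
weights = tfidf(docs)
print(weights.first())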
from pyspark import SparkConf, SparkContext
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf() \
    .setAppName("PySpark Cassandra Test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host",
         "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

sc = CassandraSparkContext(conf=conf)
print(sc.cassandraTable("tweetdb", "tweettable").select("tweet").collect())

# Writing works the same way in reverse:
# rdd = sc.parallelize([{"tweet": "first second third tweet"}])
# rdd.saveToCassandra("tweetdb", "tweettable")
start_time = datetime.datetime(2010, 10, 7, 0, 0)
sample_frequency = datetime.timedelta(minutes=30)
num_tests = 2
list_times = [start_time + x * sample_frequency for x in range(num_tests)]
num_of_meters = 30
window_size = datetime.timedelta(hours=24)
init_model_params = {}
meter_ids = get_meters()
mk = 3
lrate = 0.75
SE = 0
i = 1

program_start_time = t.time()
sc = CassandraSparkContext(appName="PySpark Cassandra Test", master="local[*]")

'''DataFrame Tests'''
# for current_time in list_times:
current_time = list_times[0]

# Readings in the trailing window, grouped per meter as a pandas Series
# indexed by date
readings = sc \
    .cassandraTable("cer", "readings") \
    .select("meter_id", "date", "measurement") \
    .where("date <= '{}' AND date >= '{}'".format(
        current_time, current_time - mk * sample_frequency)) \
    .map(lambda x: (x["meter_id"], (x["date"], x["measurement"]))) \
    .groupByKey() \
    .mapValues(lambda x: pd.Series(list(i[1] for i in x),
                                   index=list(i[0] for i in x)))

# Stored model weights per meter
model_parameters = sc \
    .cassandraTable("cer", "models") \
    .map(lambda x: (x["meter_id"], np.asanyarray(x["w"])))
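# A hypothetical next step, not in the original excerpt: both RDDs are keyed
# by meter_id, so each meter's window of readings can be paired with its
# stored weights for the model update.
readings_with_models = readings.join(model_parameters)
print(readings_with_models.first())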