Example #1
def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")

    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sql = SQLContext(sc)
    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    #tweets = kstream.map(lambda x: json.loads( x[1].decode('utf-8')))
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))

    #searchTermSentiment =
    tweetsUsentiment.pprint()

    tweetsUsentiment.saveToCassandra("tweetdb", "tweettable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
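Examples #1 and #10 both call a load_wordlist helper that is not part of the snippet. A minimal sketch, assuming each dataset file holds one word per line (this is an assumption, not the original author's code):

def load_wordlist(path):
    # Hypothetical helper: read one word per line, skipping blanks.
    with open(path) as f:
        return set(line.strip() for line in f if line.strip())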
Example #2
def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that each row returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns can all be
    # accessed by name. CQL collections (lists, sets and maps) are converted to
    # the corresponding Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {
            "visitor_id": "xyz"
        }
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example #3
def feeder(start_date, end_date, e1, e2, q):

    conf = SparkConf().setAppName("Simple App").setMaster(
        "spark://127.0.0.1:7077").set("spark.cassandra.connection.host",
                                      "127.0.0.1")

    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    a = ""
    l = ['"SP1"', '"SP2"']
    asia = pytz.timezone("Asia/Kolkata")

    #creating a dataframe for the date range and ric names
    rdd = sc.cassandraTable("testkeyspace", "stock_test").select(
        "ric", "time_stamp", "high",
        "low").where("ric in ?", ["SP1", "SP2", "SP3"]).where(
            "time_stamp > ? and time_stamp < ?",
            datetime(2010, 11, 26, 12, 30, tzinfo=asia),
            datetime(2010, 12, 10, 12, 30, tzinfo=asia)).toDF()
    # making a batch according to the time_stamp
    rdd = rdd.orderBy("time_stamp").groupBy("time_stamp").agg(
        collect_list(struct('ric', 'time_stamp', 'high', 'low'))).collect()
    # sending one batch to analytical engine
    for gr in rdd:
        e2.clear()
        send = gr[1]
        q.put(send)  #adding the batch to the queue
        e2.set()
        e1.wait()
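The feeder() snippet above is cut off from the top of its file; judging from the names it uses, the missing imports are roughly:

# Imports implied by the names used in feeder() (reconstructed, not from the original file).
from datetime import datetime
import pytz
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, struct
from pyspark_cassandra import CassandraSparkContext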
Example #4
def streaming_logic():
    """
    :function: initial spark context and all the streaming logic
    :return: None
    """

    # - read configuration from file
    spark_config, kafka_config, cassandra_config = read_config()

    # - initial spark context
    conf = SparkConf().setMaster(spark_config['master']).setAppName(spark_config['app_name']).set('spark.cassandra.connection.host', cassandra_config['cluster'])
    csc = CassandraSparkContext(conf=conf)
    csc.setLogLevel(spark_config['log_level'])
    ssc = StreamingContext(sparkContext=csc, batchDuration=spark_config['time_window'])

    # - creating kafka stream
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [kafka_config['topic_in']], {'metadata.broker.list': kafka_config['cluster']})

    # - start to process data
    # - output data structure: MetadData
    structured_stock_data = directKafkaStream.map(lambda data : preprocess_data(data=data))
    structured_stock_data.pprint(20)

    stock_data_list = structured_stock_data.reduceByKey(lambda a,b : aggregate_list(a,b))
    stock_data_list.pprint(20)

    # - get history data from cassandra
    alert_user_data = stock_data_list.mapValues(lambda dictlist : compute_stock_tending_in_window(dict_list=dictlist))
    alert_user_data.pprint(20)

    # - send alert to user
    alert_user_data.foreachRDD(lambda rdd : rdd.foreachPartition(lambda iter : send_alert_to_kafka(iterator=iter,kafka_config=kafka_config)))

    ssc.start()
    ssc.awaitTermination()
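streaming_logic() pulls its settings from a read_config() helper that is not shown. Based on the keys accessed above, it presumably returns three dicts shaped roughly like this (all values are illustrative assumptions):

def read_config():
    # Hypothetical shape inferred from the keys used in streaming_logic().
    spark_config = {'master': 'local[2]', 'app_name': 'StockStreaming',
                    'log_level': 'WARN', 'time_window': 10}
    kafka_config = {'topic_in': 'stock-topic', 'cluster': 'localhost:9092'}
    cassandra_config = {'cluster': '127.0.0.1'}
    return spark_config, kafka_config, cassandra_config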
Example #5
 def setUpClass(cls):
     super(CassandraTestCase, cls).setUpClass()
     cls.sc = CassandraSparkContext(conf=SparkConf().setAppName("PySpark Cassandra Test"))
     cls.session = Cluster().connect()
     cls.session.execute('''
         CREATE KEYSPACE IF NOT EXISTS test_pyspark_cassandra
         WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
     ''')
     cls.session.set_keyspace('test_pyspark_cassandra')
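The test base class presumably also defines a tearDownClass counterpart; a sketch of what it would look like, mirroring the cleanup done in the __main__ block of Example #12 (assumed, not shown in the snippet):

 @classmethod
 def tearDownClass(cls):
     # Assumed counterpart to setUpClass: stop Spark and close the Cassandra session.
     cls.sc.stop()
     cls.session.shutdown()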
Example #6
class SparkCassandra:
    appNameCassandra = "WikiOlapCassandra"
    appNameSQL = "WikiOlapSQL"
    master = "spark://"+socket.gethostname()+":7077"

    confCassandra = SparkConf() \
        .setAppName(appNameCassandra) \
        .setMaster(master) \
        .set("spark.cassandra.connection.host", os.environ['CASSANDRA_PORT_9042_TCP_ADDR'])


    sc = CassandraSparkContext(conf=confCassandra)
    sqlContext = SQLContext(sc)
Example #7
 def __init__(self):
     self.spark_config = SparkConf()\
         .setMaster("local[4]")\
         .setAppName("Popularity")\
         .set("spark.cassandra.connection.host", "127.0.0.1")
     self.sparkContext = CassandraSparkContext(conf=self.spark_config)
     self.cluster = Cluster()
     self.session = self.cluster.connect("music_recommendation")
     self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                          "FROM user_event "
                                          "WHERE action_type='listen';")
     self.session.execute("DROP TABLE IF EXISTS result_popularity ;")
     self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                          "sid text PRIMARY KEY,"
                          "rank int);")
     self.current_year = datetime.datetime.now().year
     self.current_month = datetime.datetime.now().month
Example #8
    def __init__(self):
        self.spark_config = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("ContentBased") \
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")

        cql_cmd = "SELECT * FROM %s"
        cmd = cql_cmd % "i_profile_artist"
        self.i_artists_res = self.session.execute(cmd)
        cmd = cql_cmd % "i_profile_composer"
        self.i_composers_res = self.session.execute(cmd)
        cmd = cql_cmd % "i_profile_genre"
        self.i_genres_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_artist"
        self.u_artists_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_composer"
        self.u_composers_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_genre"
        self.u_genres_res = self.session.execute(cmd)

        cql_cmd = "SELECT uid, song_id FROM %s"
        events = self.session.execute(cql_cmd % "user_event")
        self.events = dict()
        for event in events:
            songs = self.events.get(event.uid)
            if songs is None:
                self.events[event.uid] = [event.song_id]
            else:
                self.events[event.uid].append(event.song_id)

        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_genre ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_artist ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_composer ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
Example #9
 def __init__(self):
     self.sparkConfig = SparkConf()\
         .setMaster("local[4]")\
         .setAppName("MCF")\
         .set("spark.cassandra.connection.host", "127.0.0.1")\
         .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
     self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
     self.rank = 10
     self.numIteration = 10
     self.numberOfPreds = 10
     self.cluster = Cluster()
     self.session = self.cluster.connect("music_recommendation")
     self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                         "FROM user_event "
                                         "WHERE action_type='rate'")
     self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                          "uid text PRIMARY KEY,"
                          "recommendations list<text>);")
Example #10
def main():
    pwords = load_wordlist("../Dataset/positive.txt")
    nwords = load_wordlist("../Dataset/negative.txt")
    sterms = load_wordlist("../Dataset/keyWords.txt")
    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms))

    searchTermUsentiment = tweetsUsentiment.flatMap(
        lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey(
            lambda a, b: a + b)
    searchTermUsentiment = searchTermUsentiment.map(
        lambda (key, value): {
            "searchterm": "_" + key,
            "insertion_time": datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S'),
            "sentiment": value
        })
    searchTermUsentiment.pprint()

    searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable")
    # searchTermSentiment = tweetsUsentiment.map(lambda tweet: searchTermFunction(tweet,sterms))

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000)
    ssc.stop(stopGraceFully=True)
Example #11
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12 \
/home/ubuntu/pipeline/kafka_spark_cass_imageQuery.py localhost:2181 imgSearchRequests

#Opening spark shell with cassandra
$SPARK_HOME/bin/pyspark \
--master spark://ip-172-31-0-173:7077 \
--packages TargetHolding/pyspark-cassandra:0.3.5 \
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12
"""


db_table=0 #global rdd
producer = KafkaProducer(bootstrap_servers = 'ec2-52-41-224-1.us-west-2.compute.amazonaws.com:9092', value_serializer=lambda v: json.dumps(v).encode('ascii'))
# Kafka and Spark Streaming specific vars
batch_interval = 5  # question: why is a batch interval of 5 so much better than 3? With 3 the wait seemed much longer
sc = CassandraSparkContext(appName="PythonStreamingVSS") #http://www.slideshare.net/JonHaddad/intro-to-py-spark-and-cassandra
ssc = StreamingContext(sc, batch_interval)
keyspace="vss_large"

"""
Example usages:
db_table.select("hashvalue", "partitionby","videoname").map(lambda x: x['hashvalue']).take(3)
will result in
[u'6daab6a32cb6b209', u'77a888d7aa2f882b', u'571d23371cc358d5']
"""

def main():
    global db_table;
    global producer;
    if len(sys.argv) != 3:
        #print("Usage: thisfile.py <zk> <topic>", file=sys.stderr) #i get an error about file=sys.stderr for some reason
Example #12
                                  'test_93').on('name').collect())

        self.assertEqual(len(joined), 2)


if __name__ == '__main__':
    try:
        # connect to cassandra and create a keyspace for testing
        CassandraTestCase.session = Cluster().connect()
        CassandraTestCase.session.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s WITH
            replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
        ''' % (CassandraTestCase.keyspace, ))
        CassandraTestCase.session.set_keyspace(CassandraTestCase.keyspace)

        # create a cassandra spark context
        CassandraTestCase.sc = CassandraSparkContext(
            conf=SparkConf().setAppName("PySpark Cassandra Test"))

        # perform the unit tests
        unittest.main()
        # suite = unittest.TestLoader().loadTestsFromTestCase(RegressionTest)
        # unittest.TextTestRunner().run(suite)
    finally:
        # stop the spark context and cassandra session
        if hasattr(CassandraTestCase, 'sc'):
            CassandraTestCase.sc.stop()
        if hasattr(CassandraTestCase, 'session'):
            CassandraTestCase.session.shutdown()
Example #13
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")


def process(rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    df = df.withColumn('date', transfer_time(tweetsDataFrame.time))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql(
        "SELECT MAX(date) AS date,hashtag,count(*) AS count FROM historicaltweets WHERE hashtag IS NOT NULL GROUP BY hashtag ORDER BY count DESC"
    )
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter", "tweet")
    df.show()


if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 600)
    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": '*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)
    #text_counts = lines.map(lambda tweet: (tweet['hashtag'],1)).reduceByKey(lambda x,y: x + y)
    ssc.start()
    ssc.awaitTermination()
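process() above relies on a getSparkSessionInstance helper that the snippet omits. The name matches the lazily-instantiated singleton pattern from the Spark Streaming programming guide, so it presumably looks like this (assumed, not taken from the original file):

from pyspark.sql import SparkSession

def getSparkSessionInstance(sparkConf):
    # Reuse a single SparkSession across micro-batches.
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = SparkSession.builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]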
Example #14
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf
conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")
sc = CassandraSparkContext("spark://192.168.15.87:7077",
                           "Simple App",
                           conf=conf)
rdd = sc.cassandraTable("testkeyspace",
                        "stock2").select("ric", "time_stamp", "high",
                                         "low").spanBy('time_stamp').collect()

for gr in rdd:
    print(gr)  #one batch
    print("+++++++++++++++")
    
    # Connect to Cassandra
    cluster = Cluster(["127.0.0.1"])
    session = cluster.connect()
    
    # Initialize keyspace in Cassandra
    session.execute("CREATE KEYSPACE IF NOT EXISTS meetups_space WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };")
    
    # Initialize table in Cassandra for country and event statistics
    session.execute("CREATE TABLE IF NOT EXISTS meetups_space.country_statistics (event_country text PRIMARY KEY, total_rsvps int);")
    session.execute("CREATE TABLE IF NOT EXISTS meetups_space.response_statistics (response text PRIMARY KEY, total_rsvps int);")
    session.execute("CREATE TABLE IF NOT EXISTS meetups_space.event_statistics (event_name text, group_name text, event_country text, response text, total_rsvps int, PRIMARY KEY ((event_name), response));")
  
    # Configured the spark stream
    conf = SparkConf().set("spark.cassandra.connection.host", "localhost")
    sc = CassandraSparkContext(appName='MeetupDashboard', conf=conf)
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint("/tmp")
    topic = ["meetuptopic"]
    kafkaConf = {"metadata.broker.list": "localhost:9092",
                 "zookeeper.connect": "localhost:2181",
                 "group.id": "kafka-spark-streaming",
                 "zookeeper.connection.timeout.ms": "1000"}
    
    # DStream
    messages = KafkaUtils.createDirectStream(ssc, topic, kafkaConf)
    
    # Pre-process DStream
    lines = messages.map(lambda (key, values): json.loads(values))
    
    # Compute country statistics
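    # Hypothetical continuation (not part of the original snippet): count RSVPs
    # per country in each batch. The field names below are assumptions about the
    # Meetup RSVP payload.
    country_counts = lines \
        .map(lambda rsvp: (rsvp["group"]["group_country"], 1)) \
        .reduceByKey(lambda a, b: a + b)
    country_counts.pprint()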
Example #16
#!/usr/bin/python3
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.sql import SQLContext
from pyspark_cassandra import CassandraSparkContext, Row
import pandas

# Configuration
sc = CassandraSparkContext()
sqlContext = SQLContext(sc)


# Loads a table from a given keyspace
# returns a DataFrame (which can later be converted to an RDD)
def load_and_get_table_df(keys_space_name, table_name):
    table_df = sqlContext.read\
        .format("org.apache.spark.sql.cassandra")\
        .options(table=table_name, keyspace=keys_space_name)\
        .load()
    return table_df


# Converts data from an RDD into a DataFrame
def rdd_to_df(data):
    new_data = data.map(lambda row: (row, ))
    return sqlContext.createDataFrame(new_data)


# Steps that process the data for the TF-IDF computation
def tfidf(data):
    hashing = HashingTF()
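A typical way to use the helpers above (keyspace, table and column names are placeholders, not taken from the original source):

# Illustrative only: read a Cassandra table as a DataFrame, then work on it as an RDD.
table_df = load_and_get_table_df("some_keyspace", "some_table")
words = table_df.rdd.map(lambda row: row["text"].split(" "))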
Example #17
from pyspark import SparkConf, SparkContext
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf() \
    .setAppName("PySpark Cassandra Test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

sc = CassandraSparkContext(conf=conf)
print((sc.cassandraTable(
    "tweetdb", "tweettable").select("tweet").map(lambda a: a).collect()))
#sc.pprint()

#rdd = sc.parallelize([{"tweet":"first second third tweet"}])

#rdd.saveToCassandra(
#	"tweetdb",
#	"tweettable")
Example #18
start_time = datetime.datetime(2010, 10, 7, 0, 0)
sample_frequency = datetime.timedelta(minutes=30)
num_tests = 2
list_times = [start_time + x * sample_frequency for x in range(num_tests)]
num_of_meters = 30
window_size = datetime.timedelta(hours=24)
init_model_params = {}
meter_ids = get_meters()
mk = 3
lrate = 0.75
SE = 0
i = 1

program_start_time = t.time()

sc = CassandraSparkContext(appName="PySpark Cassandra Test", master="local[*]")
'''DataFrame Tests'''
#for current_time in list_times:
current_time = list_times[0]
readings = sc \
    .cassandraTable("cer", "readings") \
    .select("meter_id", "date", "measurement") \
    .where("date <= '{}' AND date >= '{}'".format(current_time, current_time-mk*sample_frequency))\
    .map(lambda x: (x["meter_id"], (x["date"], x["measurement"])))\
    .groupByKey()\
    .mapValues(lambda x: pd.Series(list(i[1] for i in x), index=list(i[0] for i in x)))

model_parameters = sc \
    .cassandraTable("cer", "models") \
    .map(lambda x: (x["meter_id"], np.asanyarray(x["w"])))