Example #1
def feeder(start_date, end_date, e1, e2, q):

    conf = SparkConf().setAppName("Simple App").setMaster(
        "spark://127.0.0.1:7077").set("spark.cassandra.connection.host",
                                      "127.0.0.1")

    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    a = ""
    l = ['"SP1"', '"SP2"']
    asia = pytz.timezone("Asia/Kolkata")

    #creating a dataframe for the date range and ric names
    rdd = sc.cassandraTable("testkeyspace", "stock_test").select(
        "ric", "time_stamp", "high",
        "low").where("ric in ?", ["SP1", "SP2", "SP3"]).where(
            "time_stamp > ? and time_stamp < ?",
            datetime(2010, 11, 26, 12, 30, tzinfo=asia),
            datetime(2010, 12, 10, 12, 30, tzinfo=asia)).toDF()
    # making a batch according to the time_stamp
    rdd = rdd.orderBy("time_stamp").groupBy("time_stamp").agg(
        collect_list(struct('ric', 'time_stamp', 'high', 'low'))).collect()
    # sending one batch to analytical engine
    for gr in rdd:
        e2.clear()
        send = gr[1]
        q.put(send)  #adding the batch to the queue
        e2.set()
        e1.wait()
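
# Hedged sketch (not part of the original example): the consumer side of the
# e1/e2/queue handshake that feeder() drives. A hypothetical analytical engine
# would wait on e2, pull one batch per time_stamp from the queue, process it,
# and then set e1 so that feeder() can publish the next batch.
def consume_batches(e1, e2, q, process_batch):
    while True:
        e2.wait()             # feeder signals that a batch is in the queue
        batch = q.get()       # list of (ric, time_stamp, high, low) structs
        process_batch(batch)  # hypothetical analytics step
        e1.set()              # let feeder() move on to the next batch
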
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors, notice that the data returned by Cassandra is
    # a dict-like, you can access partition, clustering keys as well as
    # columns by name. CQL collections: lists, sets and maps are converted
    # to proper Python data types
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example #3
def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors, notice that the data returned by Cassandra is
    # a dict-like, you can access partition, clustering keys as well as
    # columns by name. CQL collections: lists, sets and maps are converted
    # to proper Python data types
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {
            "visitor_id": "xyz"
        }
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example #4
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-device.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-device.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-device") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "config_device", "fsa")
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['config_device']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_device': row['config_device'],
                    'device_count': row['count'],
                    'bucket': 3
                }
                array.append(x)

            result = sc.parallelize(array)
            result.saveToCassandra('web_analytic', 'device_report')
            # break
    
    col_tenant_id = 1
    col_user_id = 2
    col_item_id = 3

    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set("spark.cassandra.connection.host", spark_cassandra_connection_host)
    print ('conf')
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    
    if LOAD_DATA_FROM_DB:
        
        data = sc.cassandraTable(cassandra_keyspace, cassandra_table, row_format=1).collect() # row_format: tuple
        # (id, tenant_id, user_id, item_id)
        tenant_ids = set(list(map(lambda x:x[col_tenant_id],data)))
        data_rdd = sc.parallelize(data)
        # data_rdd = sc.parallelize(data).map(list)
        
        all_results_per_user = sc.emptyRDD()
        all_results_per_item = sc.emptyRDD()
        
        for t_id in tenant_ids:
            print("\nComputing recommendation for tenant {}...\n".format(t_id))
            per_tenant_rdd = data_rdd.filter(
                lambda x: x[col_tenant_id] == t_id).map(
                lambda l: ((l[col_user_id],l[col_item_id]),1.0)).reduceByKey(
                lambda x,y: x + y).map(
                lambda x: (x[0][0],x[0][1],x[1]))
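            # Hedged sketch (assumption; the original fragment is cut off here):
            # the (user, item, count) triples built above would typically feed
            # Spark MLlib ALS to produce the top-N lists implied by
            # num_to_recomm_per_user / num_to_recomm_per_item.
            from pyspark.mllib.recommendation import ALS, Rating
            ratings = per_tenant_rdd.map(
                lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
            model = ALS.trainImplicit(ratings, rank=10, iterations=10)
            per_user = model.recommendProductsForUsers(num_to_recomm_per_user)
            per_item = model.recommendUsersForProducts(num_to_recomm_per_item)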
Example #6
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-pageview-total-user", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
	.setAppName("spark-calculate-pageview-total-user") \
	.set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    # i = 1514505600
    # while i <= 1514764800:
    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        # date_temp = i
        # i = i + 86400
        raw = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","userid","fsa","fsid","location_path")
        if raw.isEmpty() == False:
            df = raw.toDF()
            current_day = df.filter( df.m_date >= current_date ).filter(df.m_date < future_date).dropDuplicates(['fsa',"fsid"]).select('fsa','fsid')
            previous_day =  df.filter(df.m_date < current_date).select('fsa','fsid')
            # current_day = df.filter( df.m_date >= date_temp ).filter(df.m_date < i).dropDuplicates(['fsa',"fsid"]).select('fsa','fsid')
            # previous_day =  df.filter(df.m_date < date_temp).select('fsa','fsid')
            result_new_user = current_day.subtract(previous_day)
            total_newuser = result_new_user.count()
            result_newuser = sc.parallelize([{
                "bucket":1,
                "m_date": int(current_date),
                # "m_date": int(date_temp),
                "newusers": int(total_newuser)
            }])
            
Example #7
#	X varchar,
#	Y varchar,
#	Count int,
#	PRIMARY KEY(ZipCode, TaxonomyCode1, ProviderNumber)
#);

if __name__ == "__main__":

    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    x = sc.cassandraTable("census", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    # BusinessPracticeLocationPostalCode -> ZipCode in providers.taxonomy_count
    y = sc.cassandraTable("providers", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("businesspracticelocationpostalcode", "taxonomycode1")\
        .map(lambda x: (x[0], x[1]))

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # joining x and y
    # (zipcode, pop, meanincome, zipcode, businesspracticelocationpostalcode, taxonomycode1)
    # -> (zipcode, pop, meanincome, taxonomycode1)
    # -> (zipcode, pop, meanincome, taxonomycode1, count)


    def x_y_joinSeparator():
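        # Hedged completion (assumption; the original snippet is truncated here).
        # Following the joinSeparator() pattern used by the other examples on this
        # page: drop the duplicated join key from the joined row and keep
        # (zipcode, pop, meanincome, taxonomycode1).
        return lambda x: (x[0], x[1], x[2], x[4])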
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from subprocess import call
import subprocess
import commands
aa=commands.getstatusoutput("b=0")


conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://128.138.202.110:7077") \
    .set("spark.cassandra.connection.host", "128.138.202.117")

sc = CassandraSparkContext(conf=conf)

users = sc.cassandraTable("junk", "trump2")
trump = users.map(lambda x: {"tweet_id": x['tweet_id'],
                             "tweet": x['tweet']})





#to access Twitter API
consumer_key = "43b4urzsW8nMY3oGzB5tIIM8B"
consumer_secret = "fbGLMhkFyipYbTAz0s0S6yrN6cDGGWnEMmNaciceYjr4sgEdP2"
garbage = 0

access_token = "2990432317-eYMpYm2Ck2G1YBPvWEq7Mf9wdgzBlOydabaxmzN"
access_token_secret = "lQYcmiMlFdic9KSdmd6PClGQ3Swq8y9BgvVPOmqwhHjV2"
    mongo_client.drop_database(db_out)
    mongo_client.close()
    print 'database cleared'
    
    col_tenant_id = 1
    col_user_id = 2
    col_item_id = 3

    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    
    
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set("spark.cassandra.connection.host", spark_cassandra_connection_host)
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    data = sc.cassandraTable("mykeyspace", "transactions",row_format=1).collect() # row_format: tuple
    # (id, tenant_id, user_id, item_id)
    tenant_ids = set(list(map(lambda x:x[col_tenant_id],data)))
    data_rdd = sc.parallelize(data)
    # data_rdd = sc.parallelize(data).map(list)
    
    all_results_per_user = sc.emptyRDD()
    all_results_per_item = sc.emptyRDD()
    
    for t_id in tenant_ids:
        print("\nComputing recommendation for tenant {}...\n".format(t_id))
        per_tenant_rdd = data_rdd.filter(
            lambda x: x[col_tenant_id] == t_id).map(
            lambda l: ((l[col_user_id],l[col_item_id]),1.0)).reduceByKey(
            lambda x,y: x + y).map(
            lambda x: (x[0][0],x[0][1],x[1]))
Example #10
sqlContext = SQLContext(sc)

# Make Spark less verbose
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)


def parsePoint(data):
    #return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
    return LabeledPoint(data[0], data[1:])


# store the data from cassandra to a data frame and remove the NA value
data = sc.cassandraTable("msd_01",
                         "songs").select("song_hotttnesss", "loudness", "year",
                                         "sentiment", "tempo",
                                         "unique_words").toDF()

data = data.filter("year>0").na.drop()
print data.count()

# Scale the features with Standard Scaler
data2 = data.map(lambda x: [
    x.song_hotttnesss, x.loudness, x.year, x.sentiment, x.tempo, x.unique_words
])  #Convert each sql.row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(
    data2)  #fit a scaler on the every column
scaledData = scaler.transform(data2)  # transform our data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)
Example #11
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments
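
# Hedged sketch (assumption, not in the original snippet) of the update described
# above: pick the rows for one site, recompute cnts as the sum of the engagement
# counters, and write the rows back with saveToCassandra.
ENGAGEMENT_COLS = ["ga_videoPlays", "sda_downloads", "fb_socialFacebookLikes",
                   "fb_socialFacebookShares", "fb_socialFacebookComments",
                   "tw_socialTwitterShares", "ga_socialGooglePlusShares",
                   "gigya_socialComments"]

def recompute_cnts(row):
    # rows are dict-like (see the comments in the examples above); build a plain
    # dict carrying the key columns named in the SELECT plus the new cnts value
    return {
        "url": row["url"],
        "date": row["date"],
        "site": row["site"],
        "cnts": sum((row[c] or 0) for c in ENGAGEMENT_COLS),
    }

# updated = rdd.filter(lambda r: r["site"] == "giga").map(recompute_cnts)
# updated.saveToCassandra("el_test", "cockpit2_testTogether")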


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))
Example #12
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext


conf = SparkConf().setAppName("NBARetrieval") \
    .set("spark.cassandra.connection.timeout_ms","20000") \
    .set("spark.cassandra.connection.host", "192.168.0.10") \
    .set("spark.cassandra.auth.username", "mdi") \
    .set("spark.cassandra.auth.password", "W2yIJw6ntl5RYC54VChe3lJoXa")


sc = CassandraSparkContext(conf=conf)
rdd = sc.cassandraTable("test", "kv")

print rdd.first()
    
    mongo_client= MongoClient()
    mongo_client.drop_database(db_out)
    # print 'database cleared'


    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering")
    print 'conf'
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    
    if LOAD_DATA_FROM_DB:
        
        data_rdd = sc.cassandraTable(cassandra_keyspace, cassandra_table) # row_format: Row
        # print data

        t1 = time.time()
        tenant_ids = data_rdd.map(lambda trans:trans[col_tenant_id]).distinct().collect()
        elapsed = (time.time() - t1)
        print ("\nIt took %.2fsec to complete" % elapsed)

        t1 = time.time()
        cluster = Cluster()
        session = cluster.connect(cassandra_keyspace)
        string = 'SELECT DISTINCT ' + col_tenant_id + ' from ' +  cassandra_table
        tenant_ids = session.execute(string)
        elapsed = (time.time() - t1)
        print ("\nIt took %.2fsec to complete" % elapsed)
conf = SparkConf().setAppName("Regression on Song Hotness Analysis").setMaster("spark://muziki:7077")
sc= CassandraSparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Make Spark less verbose
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )

def parsePoint(data):
	#return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
	return LabeledPoint(data[0],data[1:])

# store the data from cassandra to a data frame and remove the NA value 
data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()

data=data.filter("year>0").na.drop()
print data.count()


# Scale the features with Standard Scaler
data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words])#Convert each sql.row to an array
scaler= StandardScaler(withMean=True, withStd=True).fit(data2) #fit a scaler on the every column
scaledData = scaler.transform(data2)# transform our data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)

# # Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True)
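
# Hedged follow-up (assumption, not in the original snippet): evaluate the fitted
# model by computing the mean squared error over the training set.
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
print("Mean Squared Error = %.4f" % MSE)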
Example #15

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 recommendation_engine/backup_user_event.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: last_like.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("last-like") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)
    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        rdd = sc.cassandraTable("db","user_event_model").select("idx_user","idx_movie","rating","time","type_event")\
                .filter(lambda x: current_date <= int(x['time']) < future_date)
        if rdd.isEmpty() == False:
            rdd.toDF().write\
                .format('com.databricks.spark.csv')\
                .option("header", "true")\
                .save('/home/trantu/Desktop/engine_recommendation.git/trunk/meta-data/user_event/' \
                        +datetime.now().strftime('%Y_%m_%d_%H_%M_%S.csv'))
            # rdd.toDF().write.csv('/home/tutn6/Desktop/engine_recommendation.git/trunk/mycsv')
            # rdd.deleteFromCassandra("db","user_event_model")
        break
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-pageview", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
	.setAppName("spark-calculate-pageview") \
	.set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    i = 1511740800
    while i <= 1514505600:
    # while True:
        date_temp = i
        i = i + 86400
        # current_date = getGMT()
        # future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","location_path")\
                .filter(lambda x: date_temp <= int(x['m_date']) < i)
        if rdd.isEmpty() == False:
            x = rdd.toDF().groupBy(['location_path']).count()
            # x.show()
            array = []
            for row in x.collect():
                x = {
                    'location_path': row['location_path'], 
                    'm_date': date_temp, 
                    'count':row['count'],
                    'bucket':5}
                array.append(x)     
            result = sc.parallelize(array)
            result.saveToCassandra('web_analytic','page_view_report')
            break
            pass
Example #17
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-language.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-language.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-language") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_browser_lan", "fsa")
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_browser_lan']).count()
            array = []
            for row in x.collect():
                x = {
                    'browser_language': row['location_browser_lan'],
                    'count': row['count'],
                    'bucket': 6
                }
                array.append(x)

            result = sc.parallelize(array)
            result.saveToCassandra('web_analytic', 'browser_language_report')
            # break
Example #18
from pyspark_cassandra import CassandraSparkContext

from operator import add

start_time = time.time()

if __name__ == "__main__":

    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    x = sc.cassandraTable("census", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    y = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "numberofdoctors")\
        .map(lambda x: (x[0], x[1]))\
        .reduceByKey(lambda x,y: x + y)

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # (zipcode, pop, meanincome, zipcode, numberofdoctors) -> (zipcode, pop, meanincome, numberofdoctors, pop/numberofdoctors)
    def joinSeparator():
        return lambda x: (x[0], x[1], x[2], x[4], float(x[1]) / x[4])

    cond = [df_x._1 == df_y._1]
    popDoctorRatioQueryRDD = df_x.join(df_y, cond) \
   .map(joinSeparator()) \
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf
conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")
sc = CassandraSparkContext("spark://192.168.15.87:7077",
                           "Simple App",
                           conf=conf)
rdd = sc.cassandraTable("testkeyspace",
                        "stock2").select("ric", "time_stamp", "high",
                                         "low").spanBy('time_stamp').collect()

for gr in rdd:
    print(gr)  #one batch
    print("+++++++++++++++")
Example #20
from operator import add

start_time = time.time()

if __name__ == "__main__":
        
    conf = SparkConf().setAppName("Query App").setMaster("spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    
    # num of hospitals RDD (zipcode, numofhospitals)
    x = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode")\
        .map(lambda x: (x, 1))\
        .reduceByKey(add)\
        .map(lambda x: (x[0][0], x[1]))

    y = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "providernumber", "hospitalname", "x", "y", "numberofdoctors")\
        .map(lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))

    z = sc.cassandraTable("census", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)
    df_z = sqlContext.createDataFrame(z)
    
    # (zipcode, numofhospitals, zipcode, providernumber, hospitalname, x, y, numberofdoctors) -> (zipcode, numofhospitals, providernumber, hospitalname, x, y, numberofdoctors)
    def x_y_joinSeparator(): return lambda x: (x[0], x[1], x[3], x[4], x[5], x[6], x[7])
#	Y varchar,
#	Count int,
#	MeanIncome int,
#	PRIMARY KEY(ZipCode, TaxonomyCode1, ProviderNumber)
#);

if __name__ == "__main__":

    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    x = sc.cassandraTable("census", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    y = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "providernumber", "hospitalname", "x", "y")\
        .map(lambda x: (x[0], x[1], x[2], x[3], x[4]))

    # BusinessPracticeLocationPostalCode -> ZipCode in providers.taxonomy_count
    z = sc.cassandraTable("providers", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("businesspracticelocationpostalcode", "taxonomycode1")\
        .map(lambda x: (x[0], x[1]))

    # num of hospitals RDD (zipcode, numberofhospitals)
    a = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode")\
        .map(lambda x: (x, 1))\
        .reduceByKey(add)\
        .map(lambda x: (x[0][0], x[1]))

    df_x = sqlContext.createDataFrame(x)
Example #22
if __name__ == "__main__":
	conf = SparkConf().setAppName("UserUserRelevance").setMaster(config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
	sc = CassandraSparkContext(conf=conf)
	
	filename = datetime.now().strftime("%Y-%m-%d")+"-usersonglog.txt"

	users = sc.textFile(config.HDFS_URL+":"+config.HDFS_PORT+config.LOG_FOLDER+filename) \
						.filter(time_range_filter) \
						.map(parse_log_entry) \
						.keys() \
						.collect()

	song_map = {} # store song to user mapping for use in later stages

	usersongdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "user_to_song")
	songuserdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "song_to_user")

	for user in users:
		user_suggest = []
		song_list = usersongdb.select("song_id") \
				.where("user_id=? and req_time > ? and req_time < ?", int(user), five_weeks_back, now+1) \
				.map(lambda row: row.song_id) \
				.distinct() \
				.collect()
		songs = list(set(song_list))

		for song in songs:
			if song in song_map:
				listeners = song_map[song]
			else:
Example #23
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-total-user.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-process-data", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-total-user") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)

    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","userid","fsa","fsid")\
                .filter(lambda x: current_date <= int(x['m_date']) < future_date)

        if rdd.isEmpty() == False:
            table = rdd.toDF()
            # table.show(truncate=False)
            total = table.dropDuplicates(['fsa', "fsid"]).count()

            result = sc.parallelize([{
                "bucket": 0,
                "m_date": int(current_date),
                "users": int(total)
            }])
        else:
            result = sc.parallelize([{
                "bucket": 0,
                "m_date": int(current_date),
Example #24
		
	result = ''
	for i in range(3000):
		if vecFirst[i] > 0:
			result = '%s;%d %.4f' % (result, i, vecFirst[i])
	return result
	

conf = SparkConf() \
    .set("spark.cassandra.connection.host", "localhost")

sc = CassandraSparkContext(conf=conf)

vecSum = sc.cassandraTable('reishi', 'dockmeans')\
	.select("cluster_id", "vector")\
	.where("cluster_id=?", 0)\
	.map(lambda x: (x['cluster_id'], x['vector']))\
	.reduceByKey(lambda x, y: maxVector(x, y))\
	.collect()
	
vector = []
#print(vecSum)

v = vecSum[0]
v = v[1]
#print('=================================' + v)
splt = v.split(';')
for t in splt:
	p = t.split(' ')
	if len(p) > 1:
		tp = tpVector(int(p[0]), float(p[1]))
		vector.append(tp)
Example #25
sc = CassandraSparkContext(conf=conf)


def retTuple(r):
    age = int(r["age"])
    if age < 20:
        return ("<20", 1)
    if age < 40:
        return ("20 < 40", 1)
    if age < 60:
        return ("40 < 60", 1)
    return (">60", 1)

result = sc.cassandraTable("zeus", "node") \
    .select("age") \
    .where("type=?", "person") \
    .map(retTuple) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()

print
print "================================"
print "AGE DEMOGRAPHICS"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-location.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-location.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-location") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_country_name", "location_country_code", "fsa")
        # rdd.toDF().show()
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_country_name',
                           'location_country_code']).count()
            array = []
            for row in x.collect():
                x = {
                    'location_country_name': row['location_country_name'],
                    'location_country_code': row['location_country_code'],
                    'location_count': row['count'],
                    'bucket': 2
                }
                array.append(x)
Example #27
from pyspark_cassandra import CassandraSparkContext

from operator import add

start_time = time.time()

if __name__ == "__main__":

    conf = SparkConf().setAppName("Tool App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    taxonomyRDD = sc.cassandraTable(
        "providers", "test",
        row_format=pyspark_cassandra.RowFormat.TUPLE).select("taxonomycode1")

    customSchema = StructType(
        [StructField("TaxonomyCode1", StringType(), True)])

    taxonomyDF = sqlContext.createDataFrame(taxonomyRDD,
                                            customSchema).distinct()

    with open("/home/cs179g/logs/taxonomyList", "w") as outfile:
        for row in taxonomyDF.rdd.collect():
            outfile.write(row[0] + '\n')

    sc.stop()
Example #28
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("ZeusDB") \
    .setMaster("local") \
    .set("spark.cassandra.connection.host", "YOUR_CLUSTER_HOST_NAME")

sc = CassandraSparkContext(conf=conf)

result = sc.cassandraTable("zeus", "edge") \
    .select("destination", "type") \
    .filter(lambda x: x["type"] == "friend") \
    .map(lambda x: (x["destination"], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .top(10, key=lambda x: x[1])

print
print "================================"
print "TOP 10 PEOPLE WITH MOST FRIENDS"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print
    # i = 1514505600
    # while i <= 1515110400:
    while True:
        # date_temp = i
        # i = i + 86400
        current_date = getGMT()
        future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic","fsa_log_visit")\
                .select(
                    "config_browser",
                    "config_device",
                    "fsa",
                    "location_browser_lan",
                    "location_country_name",
                    "location_country_code",
                    "m_date",
                    "location_path",
                    "location_city_name",
                    "config_resolution",
                    "location_os"
                    )\
                    .filter(lambda x: current_date <= int(x['m_date']) < future_date)
        # .filter(lambda x: date_temp <= int(x['m_date']) < i)

        # 1514332800
        if rdd.isEmpty() == False:
            table_drop = rdd.toDF().dropDuplicates(['fsa'])
            # table_drop.show()
            # break
            result_config_browser = table_drop.groupBy(['config_browser'
Example #30
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))
    def inner(row):
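        # Hedged completion (assumption; the original snippet is cut off here):
        # keep rows whose `col` timestamp falls inside the localized window.
        return dtf <= row[col] < dtt
    return inner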
startDate = str(sys.argv[1])
endDate = str(sys.argv[2])

conf = (
    SparkConf()
    .setAppName("User Food Migration")
    .setMaster("spark://128.138.202.110:7077")
    .set("spark.cassandra.connection.host", "128.138.202.117")
)

sc = CassandraSparkContext(conf=conf)

if __name__ == "__main__":

    rdd = sc.cassandraTable("junk", "bernie4")
    temp = 0
    # returns list of tweets
    listBernie = (
        rdd.filter(lambda row: row.created_at[4:] > startDate)
        .filter(lambda row: row.created_at[4:] < endDate)
        .collect()
    )
    for tweet in listBernie:
        if tweet.retweet_count > 0:
            print tweet.retweet_count
            temp += 1
        if tweet.favorite_count > 0:
            print tweet.favorite_count
            temp += 1
        if tweet.coordinates != None:
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext # needed for toDF()

conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://127.0.0.1:7077") \
    .set("spark.cassandra.connection.host", "127.0.0.1")

sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)

users = sc.cassandraTable("demo", "user").toDF()
food_count = users.select("favorite_food").groupBy("favorite_food").count()
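
# Hedged usage note (assumption, not in the original snippet): materialize the
# aggregation, e.g. print it with show().
food_count.show()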
KEYSPACE = "undefined"  # mettez "share" si vous n'avez pas fait l'injection des données

# check that the keyspace has actually been set correctly
if KEYSPACE == "undefined": raise Exception("You have not changed the keyspace; edit the top of the script")


# launch with spark-submit --py-files /usr/lib/spark/jars/pyspark-cassandra-0.7.0.jar livres.py


# application name
appName = "TP5 partie Spark/Cassandra"


# libraries for working with Cassandra
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext

# execution context for spark-submit
conf = SparkConf()  \
       .setAppName(appName) \
       .setMaster("spark://master:7077") \
       .set("spark.cassandra.connection.host", "master")
csc = CassandraSparkContext(conf=conf)


# open a Cassandra table using csc
livres = csc.cassandraTable(KEYSPACE, "livres")

# number of books by Jules Verne
print livres.filter(lambda livre: livre.auteur=="Jules Verne").count()
Example #34
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("ZeusDB") \
    .setMaster("local") \
    .set("spark.cassandra.connection.host", "YOUR_CLUSTER_HOST_NAME")

sc = CassandraSparkContext(conf=conf)

result = sc.cassandraTable("zeus", "edge") \
    .select("destination", "visit_count", "type") \
    .filter(lambda x: x["type"] == "visited") \
    .map(lambda x: (x["destination"], int(x["visit_count"]))) \
    .reduceByKey(lambda a, b: a + b) \
    .top(10, key=lambda x: x[1])

print
print "================================"
print "TOP 10 FREQUENTLY VISITED PLACES"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print
Example #35
from operator import add

start_time = time.time()

if __name__ == "__main__":

    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    hospitalNameList = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("hospitalname", "numberofdoctors")\
        .map(lambda x: (x[0], x[1]))\
        .reduceByKey(lambda x,y: x + y)

    #for row in popDoctorRatioRDD:
    #print (row)

    dfList = hospitalNameList.collect()
    # if you do left outer join, the results that are null for the census show up because hospitals have them

    customSchema = StructType([
        StructField("HospitalName", StringType(), True),
        StructField("NumberOfDoctors", IntegerType(), True)])

    hospitalNameDF = sqlContext.createDataFrame(dfList, customSchema)

    dfJSONRDD = hospitalNameDF.toJSON().collect()
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("PySpark Cassandra Test").set("spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(conf=conf)
data = sc.cassandraTable("mykeyspace", "user",row_format = 1).collect()
rdd = sc.parallelize(data)
print (rdd.collect())
Example #37
ETNIA = {
    1: 'BRANCA',
    2: 'PRETA',
    3: 'PARDA',
    4: 'AMARELA',
    5: 'INDÍGENA',
}

# Create the SparkContext
conf = SparkConf() \
    .setAppName("Pergunta1") \
    .set("spark.cassandra.connection.host", "10.7.40.94")
csc = CassandraSparkContext(conf=conf)

# Prepare the RDDs for the tables.
candidatos = csc.cassandraTable("eleicoes", "candidatos2014")
resultados = csc.cassandraTable("eleicoes", "resultados2014")

# Fetch the codes of the elected candidates, de-duplicated (to disregard the runoff round).
# There are 3 types of elected candidates (1, 2, 3).

cod_eleitos1 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 1)
cod_eleitos2 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 2)
cod_eleitos3 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 3)

# Union the codes of the elected candidates (all 3 types).
cod_eleitos = cod_eleitos1.union(cod_eleitos2).union(cod_eleitos3).map(
    lambda row: (row['sq_candidato'], 0))
Example #38
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
    dtf = loc.localize(datetime.strptime(_from, "%Y-%d-%m %H:%M"))
    dtt = loc.localize(datetime.strptime(_to, "%Y-%d-%m %H:%M"))
    def inner(row):
"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-last-like.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-last-like.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
	.setAppName("spark-calculate-last-like") \
	.set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("db","user_event_model").select("config_browser","m_date")
        if rdd.isEmpty() == False:
            x = rdd.toDF().groupBy(['config_browser']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_browser': row['config_browser'], 
                    'browser_count': row['count'],
                    'bucket':4
                    }
                array.append(x)
            
            result = sc.parallelize(array)
            result.saveToCassandra('test','browser_report')
Example #40
from pyspark import SparkConf, SparkContext
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf() \
    .setAppName("PySpark Cassandra Test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
#	.set("spark.cassandra.connection.host","52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

sc = CassandraSparkContext(conf=conf)
print((sc.cassandraTable(
    "tweetdb", "tweettable").select("tweet").map(lambda a: a).collect()))
#sc.pprint()

#rdd = sc.parallelize([{"tweet":"first second third tweet"}])

#rdd.saveToCassandra(
#	"tweetdb",
#	"tweettable")
Example #41
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.cassandra.connection.host", "192.168.15.87")
sc = CassandraSparkContext("spark://192.168.15.87:7077",
                           "Simple App",
                           conf=conf)
rdd = sc.cassandraTable("testkeyspace", "stock").select(
    "ric", "date", "time", "high", "low").groupBy(
        lambda r: r["date"] > 20050613 and r["date"] < 20170511).collect()

for gr in rdd:
    if gr[0]:
        new_rdd = sc.parallelize(list(gr[1]))

for time in [
        "9:30:00 AM", "10:30:00 AM", "11:30:00 AM", "12:30:00 PM",
        "1:30:00 PM", "2:30:00 PM"
]:
    rdd_temp = new_rdd.groupBy(lambda r: r["time"] == time)
    for r in rdd_temp.collect():
        if r[0]:
            for i in r[1]:
                print(i)  #each batch