import sys
import math

import pyspark_cassandra
from pyspark import SparkConf


def main(argv=None):
    if argv is None:
        keyspace = sys.argv[1]
        output = sys.argv[2]

    # Initialize Spark with the Cassandra connection
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    # Get an RDD of (host, (request_count, bytes)) pairs from Cassandra
    request = sc.cassandraTable(keyspace, 'nasalogs', row_format=pyspark_cassandra.RowFormat.DICT) \
        .map(lambda line: (line['host'], (1, line['bytes'])))

    # Per host: total requests x and total bytes y, then the six sums needed
    # for the one-pass correlation coefficient: n, Σx, Σy, Σx², Σy², Σxy
    request2 = request.reduceByKey(add_tuples) \
        .map(lambda (host, (count, byte)): ("r", (1, count, byte, count*count, byte*byte, count*byte)))
    request3 = request2.reduceByKey(add_tuples)

    # r = (nΣxy − ΣxΣy) / (sqrt(nΣx² − (Σx)²) * sqrt(nΣy² − (Σy)²))
    r = request3.map(lambda (key, sixsum): (
        key,
        (sixsum[0]*sixsum[5] - sixsum[1]*sixsum[2]) /
        (math.sqrt(sixsum[0]*sixsum[3] - sixsum[1]*sixsum[1]) *
         math.sqrt(sixsum[0]*sixsum[4] - sixsum[2]*sixsum[2]))))
    r2 = r.map(lambda (key, value): (key + '2', value * value))

    # Save the result; coalescing everything onto one node is safe here
    # because the output is only the two correlation values.
    outdata = r.union(r2)
    outdata.coalesce(1).saveAsTextFile(output)

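# add_tuples is used by the reduceByKey calls in this job (and the one further
# below) but is defined elsewhere in the repo; a minimal sketch of the
# element-wise tuple sum it is assumed to be:
import operator

def add_tuples(a, b):
    return tuple(map(operator.add, a, b))
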
import sys

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import SQLContext


def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]
    orderkeys = sys.argv[3:]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders sql')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Register the three TPC-H tables we need as temp tables
    df_for_orders(keyspace, sc, sqlContext)
    df_for_lineitem(keyspace, sc, sqlContext)
    df_for_part(keyspace, sc, sqlContext)

    df_orders = sqlContext.sql(
        """SELECT o.orderkey, o.totalprice, p.name
           FROM orders o
           JOIN lineitem l ON (o.orderkey = l.orderkey)
           JOIN part p ON (l.partkey = p.partkey)
           WHERE o.orderkey IN (""" + ",".join(orderkeys) + ")")

    rdd_orders = df_orders.rdd \
        .map(map_key) \
        .reduceByKey(reduce_parts) \
        .sortByKey() \
        .map(map_output) \
        .coalesce(1)
    rdd_orders.saveAsTextFile(output)

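# The df_for_* helpers above are defined elsewhere; a hedged sketch of the
# pattern they are assumed to follow (read a Cassandra table, register it as a
# temp table for Spark SQL), shown for df_for_orders:
def df_for_orders(keyspace, sc, sqlContext):
    df = sc.cassandraTable(keyspace, 'orders').toDF()
    df.registerTempTable('orders')
    return df
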
import sys
import math

import pyspark_cassandra
from pyspark import SparkConf


def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    conf.setAppName('correlate logs better in cassandra')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    # One (bytes, requests) pair per host, summed per host
    mapped_lines = rdd_for(keyspace, "nasalogs", sc, split_size=9000)
    mapped_tuples = mapped_lines.map(get_tuples)
    reduced_tuples = mapped_tuples.reduceByKey(add_tuples)
    reduced_tuples.cache()

    # Means of both variables
    N = reduced_tuples.count()
    average_compute = reduced_tuples.map(lambda p: (p[1][0], p[1][1]))
    bytes_total, requests_total = average_compute.reduce(add_tuples)
    bytes_avg = 1.0 * bytes_total / N
    requests_avg = 1.0 * requests_total / N

    # Correlation from the deviations about the means:
    # r = Σ(x−x̄)(y−ȳ) / (sqrt(Σ(x−x̄)²) * sqrt(Σ(y−ȳ)²))
    temp = reduced_tuples.map(lambda p: calculate(p, bytes_avg, requests_avg))
    numerator, denom_1, denom_2 = temp.reduce(add_tuples)
    denom = math.sqrt(denom_1) * math.sqrt(denom_2)
    result = numerator / denom

    temp = ['r = %f' % (result), 'r^2 = %f' % (result ** 2)]
    sc.parallelize(temp, numSlices=1).saveAsTextFile(output)

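# get_tuples and calculate are defined elsewhere; hedged sketches of what the
# job above assumes. get_tuples emits one (bytes, request) pair per log row:
def get_tuples(row):
    return (row['host'], (row['bytes'], 1))

# calculate returns one host's contribution to the three sums of the
# mean-centred correlation: Σ(x−x̄)(y−ȳ), Σ(x−x̄)² and Σ(y−ȳ)²:
def calculate(p, bytes_avg, requests_avg):
    host, (bytes_sum, requests_sum) = p
    dx = bytes_sum - bytes_avg
    dy = requests_sum - requests_avg
    return (dx * dy, dx * dx, dy * dy)
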
import sys
import uuid

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import SQLContext


def main(argv=None):
    if argv is None:
        inputs = sys.argv[1]
        keyspace = sys.argv[2]
        table = sys.argv[3]

    # Initialize Spark with the Cassandra connection
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Parse the input CSV according to the target table's schema
    if table == "yelp_review":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("user_id", "review_id", "votes_cool", "business_id", "votes_funny",
                    "stars", "date", "votes_useful")
        rdd = df.rdd.map(lambda line: (line[0], line[1], int(line[2]), line[3],
                                       int(line[4]), int(line[5]), line[6], int(line[7])))
        columns = ["user_id", "review_id", "votes_cool", "business_id", "votes_funny",
                   "stars", "date", "votes_useful"]
    elif table == "yelp_business":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("business_id", "name", "review_count", "state", "full_address",
                    "open", "city", "latitude", "longitude", "stars")
        rdd = df.rdd.map(lambda line: (line[0], line[1], int(line[2]), line[3],
                                       line[4], line[5], line[6], float(line[7]),
                                       float(line[8]), float(line[9])))
        columns = ["business_id", "name", "review_count", "state", "full_address",
                   "open", "city", "latitude", "longitude", "stars"]
    elif table == "yelp_business_checkin":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("business_id", "day", "hour", "checkin")
        # Checkins have no natural key, so generate a UUID per row
        rdd = df.rdd.map(lambda line: (str(uuid.uuid1()), line[0], int(line[1]),
                                       int(line[2]), int(float(line[3]))))
        columns = ["id", "business_id", "day", "hour", "checkin"]
    elif table == "yelp_user":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("user_id", "name", "yelping_since")
        # Insert a constant placeholder password for every user
        rdd = df.rdd.map(lambda line: (line[0], line[1], '12345678', line[2]))
        columns = ["user_id", "name", "password", "yelping_since"]

    # Save the result to Cassandra
    rdd.saveToCassandra(keyspace, table, columns=columns, batch_size=300,
                        parallelism_level=1000)

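# Example invocation (hypothetical file name, paths and keyspace), assuming the
# spark-csv and pyspark-cassandra packages are available on the cluster:
#
#   spark-submit \
#       --packages com.databricks:spark-csv_2.10:1.5.0,TargetHolding:pyspark-cassandra:0.3.5 \
#       load_yelp_data.py /user/me/yelp/yelp_review.csv mykeyspace yelp_review
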
import sys

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import SQLContext


def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]
    orderkeys = sys.argv[3:]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders denorm cassandra')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    orders_rdd = rdd_for_orders(keyspace, sc, sqlContext, orderkeys)
    orders_rdd = orders_rdd.map(map_output)
    orders_rdd.saveAsTextFile(output)

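# rdd_for_orders is defined elsewhere; a hedged sketch, assuming it reads the
# denormalized orders_parts table (built by the denormalize job below) and
# keeps only the requested order keys:
def rdd_for_orders(keyspace, sc, sqlContext, orderkeys):
    keys = set(int(k) for k in orderkeys)
    return sc.cassandraTable(keyspace, 'orders_parts',
                             row_format=pyspark_cassandra.RowFormat.DICT) \
        .filter(lambda row: row['orderkey'] in keys)
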
import sys
import re
import uuid
import datetime

import pyspark_cassandra
from pyspark import SparkConf


def main(argv=None):
    if argv is None:
        inputs = sys.argv[1]
        keyspace = sys.argv[2]

    # Initialize Spark with the Cassandra connection
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    # Parse host, datetime, path and bytes out of each NASA web server log line
    text = sc.textFile(inputs)
    linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")
    request = text.map(lambda line: linere.split(line)) \
        .filter(lambda lineSplit: len(lineSplit) > 4) \
        .map(lambda lineSplit: (str(uuid.uuid1()), lineSplit[1],
                                datetime.datetime.strptime(lineSplit[2], '%d/%b/%Y:%H:%M:%S').isoformat(),
                                lineSplit[3], int(lineSplit[4])))

    # Save the result to Cassandra
    request.saveToCassandra(keyspace, 'nasalogs',
                            columns=["id", "host", "datetime", "path", "bytes"],
                            batch_size=300, parallelism_level=1000)

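# For reference, a made-up sample log line and how linere.split() breaks it up
# (the leading/trailing '' come from splitting on a full-line match, which is
# why the length checks above look for more than four elements):
#
#   in:  'host1.example.com - - [01/Aug/1995:00:00:01 -0400] "GET /images/launch.gif HTTP/1.0" 200 1713'
#   out: ['', 'host1.example.com', '01/Aug/1995:00:00:01', '/images/launch.gif', '1713', '']
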
import sys
import re
import datetime as dt

import pyspark_cassandra
from pyspark import SparkConf


def main():
    # Input directory and target keyspace
    inputs = sys.argv[1]
    keyspace = sys.argv[2]

    # Cluster configuration
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    # Read the input file and match each line against the log pattern
    file_data = sc.textFile(inputs)
    linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")

    # Pull the four required values out of each NASA web server log line
    KeyValue = file_data.map(lambda line: linere.split(line)) \
        .filter(lambda x: len(x) == 6) \
        .map(lambda y: (y[1], dt.datetime.strptime(y[2], '%d/%b/%Y:%H:%M:%S'), y[3], y[-2])) \
        .cache()

    # Reshape each record into the four columns of the nasalogs table
    Nasa = KeyValue.map(lambda p: {"host": p[0], "datetime": p[1], "path": p[2], "bytes": long(p[3])})
    Nasa.saveToCassandra(keyspace, 'nasalogs')

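# Note: unlike the loader above, this version writes dict rows, so
# saveToCassandra maps the dict keys to column names and no explicit column
# list (or generated id) is needed; the datetime is also stored as a datetime
# object rather than an ISO-formatted string.
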
import sys

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import SQLContext


def main():
    input_keyspace = sys.argv[1]
    output_keyspace = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders denormalize cassandra')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Collect the part names for each order into a single list column
    df_for_part(input_keyspace, sc, sqlContext)
    df_lineitems = df_for_lineitem(input_keyspace, sc, sqlContext)
    df_part = sqlContext.sql(
        """SELECT l.orderkey, p.name
           FROM lineitem l
           JOIN part p ON (l.partkey = p.partkey)""")
    df_part_names = rows_to_list(df_part, "orderkey", "name", "names", sqlContext)
    df_part_names.registerTempTable("part_names")

    # Attach the name list to each order row
    df_orders = df_for_orders(input_keyspace, sc, sqlContext)
    df_orders = sqlContext.sql(
        """SELECT o.*, p.names AS part_names
           FROM orders o
           JOIN part_names p ON (o.orderkey = p.orderkey)""")

    rdd_orders = df_orders.rdd.map(map_primary_key)
    rdd_orders.saveToCassandra(output_keyspace, 'orders_parts', parallelism_level=64)

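# rows_to_list is defined elsewhere; a minimal sketch, assuming it collapses
# the value column into one list per key and returns the result as a
# two-column DataFrame:
def rows_to_list(df, key_col, value_col, list_col, sqlContext):
    grouped = df.rdd.map(lambda r: (r[key_col], [r[value_col]])) \
        .reduceByKey(lambda a, b: a + b)
    return sqlContext.createDataFrame(grouped, [key_col, list_col])
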
def to_cassandra_data(line, types):
    # (The opening of this function is missing from the original snippet; the
    # split below is an assumption based on the pipe-delimited .tbl format.)
    data = line.split('|')
    cdata = dict((f, v) for v, (f, t) in zip(data, types))
    return cdata


def read_table(keyspace, sc, input_dir, tbl, types):
    infile = os.path.join(input_dir, tbl + '.tbl.gz')
    cdata = sc.textFile(infile).repartition(100).map(
        lambda line: to_cassandra_data(line, types)).setName(tbl)
    cdata.saveToCassandra(keyspace, tbl, consistency_level=ConsistencyLevel.ONE)


def read_tables(sc, input_dir, keyspace):
    for tbl, types, primarykey in tables:
        read_table(keyspace, sc, input_dir, tbl, types)


if __name__ == '__main__':
    input_dir = sys.argv[1]
    keyspace = sys.argv[2]

    from pyspark import SparkConf
    import pyspark_cassandra

    conf = SparkConf().setAppName('TPC ingest') \
        .set("spark.cassandra.connection.host", ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    read_tables(sc, input_dir, keyspace)

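# Assumed module-level context for the ingest job above (a sketch; the real
# `tables` list presumably covers every TPC-H table with its field names,
# type converters and primary key):
import os
import sys

from cassandra import ConsistencyLevel  # assumed source of ConsistencyLevel.ONE

cluster_seeds = ['199.60.17.136', '199.60.17.173']
tables = [
    ('part', [('partkey', int), ('name', str), ('retailprice', float)], ['partkey']),
    # ... remaining TPC-H tables elided ...
]
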
import sys
import math

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS


def main():
    outdir = sys.argv[1]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds)) \
        .set('spark.dynamicAllocation.maxExecutors', 20)
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Load the yelp_review data
    yelp_review = sc.cassandraTable("bigbang", "yelp_review").select('user_id', 'business_id', 'stars') \
        .map(lambda r: (r['user_id'], r['business_id'], r['stars'])).cache()

    # ALS needs integer ids, so map the string user and business ids to ints
    user_id_lst = list(set(yelp_review.map(lambda r: r[0]).collect()))
    user_dict = map_string_to_int(user_id_lst)
    business_id_lst = list(set(yelp_review.map(lambda r: r[1]).collect()))
    business_dict = map_string_to_int(business_id_lst)
    ratings = yelp_review.map(
        lambda r: (user_dict[r[0]], business_dict[r[1]], r[2])).cache()

    # Split the ratings into training, validation and test sets
    training_RDD, validation_RDD, test_RDD = ratings.randomSplit([6, 2, 2], seed=0L)
    validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1])).cache()
    test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1])).cache()

    # Grid search over rank, iterations and regularization, selected by
    # validation RMSE
    seed = 5L
    iterations = [10, 15, 20]
    regularization_parameters = [1.0, 0.1, 0.01]
    ranks = [12, 16, 20]
    min_error = float('inf')
    best_rank = -1
    for rank in ranks:
        for iteration in iterations:
            for regularization_parameter in regularization_parameters:
                model = ALS.train(training_RDD, rank, iteration,
                                  lambda_=regularization_parameter, seed=seed)
                predictions = model.predictAll(validation_for_predict_RDD).map(
                    lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = validation_RDD.map(
                    lambda r: ((r[0], r[1]), r[2])).join(predictions)
                error = math.sqrt(
                    rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
                print 'For rank %s, iteration %s, regularization parameter %s the RMSE is %s' % (
                    rank, iteration, regularization_parameter, error)
                if error < min_error:
                    min_error = error
                    best_rank = rank
                    best_iteration = iteration
                    best_regularizer = regularization_parameter

    print 'The best model was trained with rank %s, iteration %s, regularization parameter %s' % (
        best_rank, best_iteration, best_regularizer)

    # Evaluate the selected model on the held-out test set
    model = ALS.train(training_RDD, best_rank, best_iteration,
                      lambda_=best_regularizer, seed=seed)
    predictions = model.predictAll(test_for_predict_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = test_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
    print 'For the test data the RMSE is %s' % (error)

    # Persist the chosen parameters and errors for the training job to pick up
    result = sc.parallelize([('best_rank', best_rank),
                             ('best_iteration', best_iteration),
                             ('best_regularizer', best_regularizer),
                             ('train_error', min_error),
                             ('test_error', error)])
    outdata = result.map(lambda o_str: u"%s : %s" % (o_str[0], o_str[1])).coalesce(1)
    outdata.saveAsTextFile(outdir)

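# map_string_to_int (and its inverse, used by the prediction job further
# below) are shared helpers defined elsewhere; minimal sketches:
def map_string_to_int(ids):
    # assign each distinct string id a stable small integer
    return dict((s, i) for i, s in enumerate(ids))

def map_int_to_string(d):
    # invert the string -> int mapping
    return dict((i, s) for s, i in d.items())
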
import sys
import json
import re

import pyspark_cassandra


def clean_str(text):
    text = text + " "
    text = re.sub("http[^ ]*[\\\]", "\\\\", text)  # Remove hyperlinks ending in a backslash
    text = re.sub("http[^ ]* ", " ", text)         # Remove remaining hyperlinks
    text = text.replace('\\n', ' ')
    # Single punctuation marks are dropped; smileys remain intact
    arr = re.findall(r"\w+(?:[-']\w+)*|'|[:)-.(]+|\S\w*", text)
    # Remove words starting with @ (Twitter mentions)
    arr = [i for i in arr if len(i) > 1 and i[0] != '@']
    # Remove '#' from hashtags
    arr = [i if i[0] != '#' else i[1:] for i in arr]
    # arr = [i for i in arr if i != 'http' and i != 'com' and i != 'org']
    res = " ".join(arr)
    return res.lower().strip()


fp, out = sys.argv[1].split(',')
sc = pyspark_cassandra.CassandraSparkContext()
data = sc.textFile(fp, 36)

# Clean the tweet text, then score each tweet with the (externally defined)
# predictTweet sentiment model and keep its positive-class score
clean_text = data.map(json.loads) \
    .map(lambda x: (x, clean_str(x['text'])))
json_preds = clean_text.map(lambda x: (x[0], predictTweet(x[1])['pos'])) \
    .map(json.dumps)
json_preds.saveAsTextFile(out)

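# A worked example of clean_str on a made-up tweet: the link is stripped, the
# "@user" mention is dropped, the '#' is removed from the hashtag, and the
# smiley survives the tokenizer:
#
#   clean_str("Check this http://example.com now #Spark @user :)")
#   -> 'check this now spark :)'
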
import sys
from decimal import Decimal

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS


def main():
    in_path = sys.argv[1]
    # outdir = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Load the best parameters found by model_selection.py
    text = sc.textFile(in_path)
    text_processed = text.map(lambda line: line.split(":")).cache()
    best_rank = text_processed.filter(
        lambda w: str(w[0]).strip() == 'best_rank').map(lambda w: w[1]).collect()
    best_rank_int = int(best_rank[0])
    best_iteration = text_processed.filter(
        lambda w: str(w[0]).strip() == 'best_iteration').map(lambda w: w[1]).collect()
    best_iter_int = int(best_iteration[0])
    best_regularizer = text_processed.filter(
        lambda w: str(w[0]).strip() == 'best_regularizer').map(lambda w: w[1]).collect()
    best_reg_float = float(best_regularizer[0])

    # Load the yelp_business data
    yelp_business = sc.cassandraTable("bigbang", "yelp_business") \
        .select('business_id', 'city', 'full_address', 'latitude', 'longitude',
                'name', 'open', 'review_count', 'stars', 'state').cache()

    # Load the yelp_review data
    yelp_review = sc.cassandraTable("bigbang", "yelp_review").select('user_id', 'business_id', 'stars') \
        .map(lambda r: (r['user_id'], r['business_id'], r['stars'])).cache()

    # Map the string user_ids and business_ids to int values, since ALS
    # requires integer ids
    user_id_lst = list(set(yelp_review.map(lambda r: r[0]).collect()))
    user_dict = map_string_to_int(user_id_lst)
    business_id_lst = list(set(yelp_review.map(lambda r: r[1]).collect()))
    business_dict = map_string_to_int(business_id_lst)
    ratings = yelp_review.map(
        lambda r: (user_dict[r[0]], business_dict[r[1]], r[2])).cache()

    # Train the model with the best parameters found by model_selection.py
    seed = 5L
    model = ALS.train(ratings, best_rank_int, best_iter_int,
                      lambda_=best_reg_float, seed=seed)

    # Build the (user, business) pairs to predict: every distinct business
    # crossed with a sample of users (the LIMIT 10 keeps the cross join, and
    # therefore predictAll, tractable)
    ratings_df = ratings.toDF()
    ratings_df.registerTempTable('ratingsTable')
    user_distinct_df = sqlContext.sql(
        """SELECT DISTINCT(_1) AS user_id FROM ratingsTable LIMIT 10""")
    user_distinct_df.registerTempTable('userDistinctTable')
    business_distinct_df = sqlContext.sql(
        """SELECT DISTINCT(_2) AS business_id FROM ratingsTable""")
    business_distinct_df.registerTempTable('businessDistinctTable')
    user_movie_for_pred_df = sqlContext.sql(
        """SELECT user_id, business_id
           FROM userDistinctTable, businessDistinctTable""")
    user_movie_for_pred_rdd = user_movie_for_pred_df.rdd.map(
        lambda p: (int(p[0]), int(p[1])))

    # Predict ratings for the (user_id, business_id) combinations built above
    predictions = model.predictAll(user_movie_for_pred_rdd).map(
        lambda r: (r[0], r[1], r[2])).cache()

    # Map the int ids back to their string values
    user_dict = map_int_to_string(user_dict)
    business_dict = map_int_to_string(business_dict)

    # Save the predicted ratings to the Cassandra table 'user_pred_ratings'
    preds = predictions.map(lambda p: (str(user_dict[p[0]]),
                                       str(business_dict[p[1]]),
                                       Decimal(p[2]).quantize(Decimal('1.00'))))
    print(preds.take(5))
    # preds.saveAsTextFile(outdir)
    preds.saveToCassandra('bigbang', 'user_pred_ratings',
                          columns=["user_id", "business_id", "user_pred_rating"],
                          parallelism_level=1000)

import uuid

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import functions as func
from pyspark.sql.window import Window


def main():
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # Load the yelp_business data
    yelp_business = sc.cassandraTable("bigbang", "yelp_business") \
        .select('business_id', 'city', 'full_address', 'latitude', 'longitude',
                'name', 'open', 'review_count', 'stars', 'state').toDF()
    yelp_business.registerTempTable('business')

    # Load the yelp_user data
    yelp_user = sc.cassandraTable("bigbang", "yelp_user") \
        .select('user_id', 'name', 'yelping_since').toDF()
    yelp_user.registerTempTable('user')

    # Load the user_pred_ratings data
    user_pred_ratings = sc.cassandraTable("bigbang", "user_pred_ratings") \
        .select('user_id', 'business_id', 'user_pred_rating').toDF()
    user_pred_ratings.registerTempTable('userPredRatings')

    # Load the checkin data
    business_checkin = sc.cassandraTable("bigbang", "yelp_business_checkin") \
        .select('business_id', 'checkin', 'day', 'hour').toDF()
    business_checkin.registerTempTable('businessCheckin')

    # Join user, business and ratings into one denormalized table
    user_res_pred_denorm_df = sqlContext.sql(
        """SELECT a.user_id AS user_id, b.city AS city, a.business_id AS business_id,
                  b.full_address AS full_address, b.latitude AS latitude,
                  b.longitude AS longitude, b.name AS business_name, b.open AS open,
                  b.review_count AS review_count, b.stars AS stars, b.state AS state,
                  c.name AS user_name, a.user_pred_rating AS user_pred_rating
           FROM userPredRatings a
           INNER JOIN business b ON a.business_id = b.business_id
           INNER JOIN user c ON a.user_id = c.user_id""")

    # Equivalent SQL with row_number(), kept for reference:
    # user_res_pred_denorm_df.registerTempTable('userResPredDenorm')
    # recommendations_top10_df = sqlContext.sql(
    #     """SELECT user_id, city, business_id, full_address, latitude, longitude,
    #               business_name, open, review_count, stars, state, user_name, user_pred_rating
    #        FROM (SELECT user_id, city, business_id, full_address, latitude, longitude,
    #                     business_name, open, review_count, stars, state, user_name, user_pred_rating,
    #                     row_number() OVER (PARTITION BY user_id, state
    #                                        ORDER BY user_pred_rating DESC) rn
    #              FROM userResPredDenorm) temp
    #        WHERE temp.rn <= 20""")

    # Top 20 restaurants per user per state via window analytic functions
    # (func.rowNumber() is the Spark 1.x name; later releases call it row_number())
    windowSpec = Window.partitionBy(user_res_pred_denorm_df['user_id'],
                                    user_res_pred_denorm_df['state']) \
        .orderBy(user_res_pred_denorm_df['user_pred_rating'].desc())
    rn_business = func.rowNumber().over(windowSpec)
    recommendations_ranked_df = user_res_pred_denorm_df.select(
        user_res_pred_denorm_df['user_id'],
        user_res_pred_denorm_df['city'],
        user_res_pred_denorm_df['business_id'],
        user_res_pred_denorm_df['full_address'],
        user_res_pred_denorm_df['latitude'],
        user_res_pred_denorm_df['longitude'],
        user_res_pred_denorm_df['business_name'],
        user_res_pred_denorm_df['open'],
        user_res_pred_denorm_df['review_count'],
        user_res_pred_denorm_df['stars'],
        user_res_pred_denorm_df['state'],
        user_res_pred_denorm_df['user_name'],
        user_res_pred_denorm_df['user_pred_rating'],
        rn_business.alias("rn"))
    # Keep every column except the rank
    recommendations_top20_df = recommendations_ranked_df.filter(
        recommendations_ranked_df["rn"] <= 20).drop('rn')

    recommendations_top20_df.registerTempTable('recommendations')
    recommendations_top20_rdd = recommendations_top20_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['city'], p['business_id'],
        p['full_address'], p['latitude'], p['longitude'], p['business_name'],
        p['open'], p['review_count'], p['stars'], p['state'], p['user_name'],
        p['user_pred_rating']))

    # Load the recommendations table in Cassandra
    recommendations_top20_rdd.saveToCassandra(
        'bigbang', 'recommendations',
        columns=[
            "uid", "user_id", "business_city", "business_id",
            "business_full_address", "business_latitude", "business_longitude",
            "business_name", "business_open", "business_review_count",
            "business_stars", "business_state", "user_name", "user_pred_rating"
        ],
        parallelism_level=1000)

    # Join the recommendations with the checkin data
    user_res_checkin_denorm_df = sqlContext.sql(
        """SELECT a.user_id AS user_id, a.city AS business_city, a.business_id AS business_id,
                  a.full_address AS business_full_address, a.latitude AS business_latitude,
                  a.longitude AS business_longitude, a.business_name AS business_name,
                  a.open AS business_open, a.review_count AS business_review_count,
                  a.stars AS business_stars, a.state AS business_state,
                  a.user_name AS user_name, a.user_pred_rating AS user_pred_rating,
                  b.day AS day, b.hour AS hour, b.checkin AS checkin
           FROM recommendations a
           INNER JOIN businessCheckin b ON a.business_id = b.business_id""")
    recommendations_checkin_rdd = user_res_checkin_denorm_df.rdd.map(
        lambda p: (str(uuid.uuid1()), p['user_id'], p['business_city'],
                   p['business_id'], p['business_full_address'],
                   p['business_latitude'], p['business_longitude'],
                   p['business_name'], p['business_open'],
                   p['business_review_count'], p['business_stars'],
                   p['business_state'], p['user_name'], p['user_pred_rating'],
                   p['day'], p['hour'], p['checkin']))

    # Load the recommendations_checkin table in Cassandra
    recommendations_checkin_rdd.saveToCassandra(
        'bigbang', 'recommendations_checkin',
        columns=[
            "uid", "user_id", "business_city", "business_id",
            "business_full_address", "business_latitude", "business_longitude",
            "business_name", "business_open", "business_review_count",
            "business_stars", "business_state", "user_name", "user_pred_rating",
            "day", "hour", "checkin"
        ],
        parallelism_level=1000)