# Clear the MongoDB output database before writing fresh recommendations
mongo_client = MongoClient()
mongo_client.drop_database(db_out)
mongo_client.close()
print('database cleared')

# Column positions within each Cassandra row tuple: (id, tenant_id, user_id, item_id)
col_tenant_id = 1
col_user_id = 2
col_item_id = 3
num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

# Spark context configured with the Cassandra connector
conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set(
    "spark.cassandra.connection.host", spark_cassandra_connection_host)
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')  # checkpoint dir for long RDD lineages during iterative training

# Read all transactions; row_format=1 returns each row as a tuple
# (id, tenant_id, user_id, item_id)
data = sc.cassandraTable("mykeyspace", "transactions", row_format=1).collect()
tenant_ids = set(map(lambda x: x[col_tenant_id], data))

data_rdd = sc.parallelize(data)
# data_rdd = sc.parallelize(data).map(list)

# Accumulators for the per-tenant recommendation results
all_results_per_user = sc.emptyRDD()
all_results_per_item = sc.emptyRDD()

for t_id in tenant_ids:
    print("\nComputing recommendations for tenant {}...\n".format(t_id))
    # Keep only this tenant's rows and count (user, item) occurrences as implicit ratings
    per_tenant_rdd = data_rdd.filter(
        lambda x: x[col_tenant_id] == t_id).map(
        lambda l: ((l[col_user_id], l[col_item_id]), 1.0)).reduceByKey(
        lambda x, y: x + y).map(