def produce_song_pairs(song_list):
    """Return every unordered pair of songs in *song_list*, each tagged with a count of 1.

    The pairs come out as ``((song_a, song_b), 1)`` tuples, ready for a
    ``reduceByKey`` that sums the counts. Assumes *song_list* is already
    sorted and de-duplicated (the pipeline below guarantees this), so each
    pair appears in one canonical order and aggregates correctly.
    """
    return [(song_pair, 1) for song_pair in combinations(song_list, 2)]


def cassandra_row_format(song_pair):
    """Expand one ``((song_a, song_b), frequency)`` tuple into two Cassandra rows.

    One row per direction, so the pair can be looked up by either song id.
    Song ids are coerced to ``int`` to match the Cassandra column type.
    """
    (song_a, song_b), frequency = song_pair
    return [
        {"song_id": int(song_a), "freq_song_id": int(song_b), "frequency": frequency},
        {"song_id": int(song_b), "freq_song_id": int(song_a), "frequency": frequency},
    ]


if __name__ == "__main__":
    conf = SparkConf() \
        .setAppName("FrequentPatternsSongs") \
        .setMaster(config.SPARK_MASTER) \
        .set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
    sc = CassandraSparkContext(conf=conf)

    # Only song pairs listened to together more than this many times are kept.
    frequency_threshold = 3

    # Today's log file, e.g. "2024-01-31-usersonglog.txt".
    filename = datetime.now().strftime("%Y-%m-%d") + "-usersonglog.txt"

    # NOTE(review): time_range_filter / parse_log_entry are defined elsewhere;
    # parse_log_entry presumably yields (user_id, [song_ids]) — confirm upstream.
    sc.textFile(config.HDFS_URL + ":" + config.HDFS_PORT + config.LOG_FOLDER + filename) \
        .filter(time_range_filter) \
        .map(parse_log_entry) \
        .reduceByKey(lambda songs_a, songs_b: songs_a + songs_b) \
        .map(lambda user_songs: sorted(set(user_songs[1]))) \
        .flatMap(produce_song_pairs) \
        .reduceByKey(lambda count_a, count_b: count_a + count_b) \
        .filter(lambda song_pair: song_pair[1] > frequency_threshold) \
        .flatMap(cassandra_row_format) \
        .saveToCassandra(config.CASSANDRA_KEYSPACE, "frequent_song_pairs")

    # BUG FIX: SparkContext has no close() method — the original sc.close()
    # raised AttributeError after the job finished; stop() is the correct call.
    sc.stop()