예제 #1
0
def produce_song_pairs(song_list):
	"""Tag every 2-combination of ``song_list`` with an initial count of 1.

	Each element of the result has the shape ``((song_a, song_b), 1)``, where
	``(song_a, song_b)`` comes from ``combinations(song_list, 2)`` and keeps
	the order of ``song_list``. The count of 1 lets a later ``reduceByKey``
	sum the occurrences of each pair.
	"""
	def _with_unit_count(pair):
		# One observation of this pair.
		return (pair, 1)

	return map(_with_unit_count, combinations(song_list, 2))

def cassandra_row_format(song_pair):
	"""Expand one ``((song_a, song_b), frequency)`` entry into two row dicts.

	The pair is emitted in both directions (a -> b and b -> a) so that either
	song can serve as the ``song_id`` of a row. Both ids are converted with
	``int``; the frequency is passed through unchanged.
	"""
	ids, count = song_pair[0], song_pair[1]
	first_id, second_id = int(ids[0]), int(ids[1])
	return [
		{"song_id": left, "freq_song_id": right, "frequency": count}
		for left, right in ((first_id, second_id), (second_id, first_id))
	]


if __name__ == "__main__":
	# Spark context configured with the Cassandra seed node so the resulting
	# RDD can be written out with saveToCassandra.
	conf = SparkConf().setAppName("FrequentPatternsSongs").setMaster(config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
	sc = CassandraSparkContext(conf=conf)

	try:
		# A song pair is kept only when its summed count is strictly greater
		# than this value.
		frequency_threshold = 3

		# The input log file is named after today's date (YYYY-MM-DD prefix).
		filename = datetime.now().strftime("%Y-%m-%d")+"-usersonglog.txt"

		# Pipeline:
		#   1. read the log and keep the lines accepted by time_range_filter
		#   2. parse each line into a (key, list) pair and concatenate the
		#      lists per key -- presumably (user_id, [song_id]); the helpers
		#      are defined outside this snippet, TODO confirm
		#   3. de-duplicate and sort each list, so that a given pair of songs
		#      always comes out of combinations() in the same order and
		#      therefore maps to the same reduceByKey key
		#   4. emit every pair with a count of 1 and sum the counts per pair
		#   5. keep pairs above the threshold and store each one in both
		#      directions in the "frequent_song_pairs" table
		sc.textFile(config.HDFS_URL+":"+config.HDFS_PORT+config.LOG_FOLDER+filename) \
			.filter(time_range_filter) \
			.map(parse_log_entry) \
			.reduceByKey(lambda song1, song2: song1+song2) \
			.map(lambda x: sorted(set(x[1]))) \
			.flatMap(produce_song_pairs) \
			.reduceByKey(lambda a,b: a+b) \
			.filter(lambda song_pair: song_pair[1] > frequency_threshold) \
			.flatMap(cassandra_row_format) \
			.saveToCassandra(config.CASSANDRA_KEYSPACE, "frequent_song_pairs")
	finally:
		# SparkContext is shut down with stop(); it has no close() method, so
		# the previous sc.close() call failed after the job had run. Stopping
		# in a finally block also releases the context when the job raises.
		sc.stop()
예제 #2
0
def parse_log_entry(line):
	"""Turn one comma-separated log line into a ``(key, [value])`` pair.

	The second field becomes the key and the third field becomes the single
	element of the value list; both are passed through ``str``. Lines with
	fewer than three fields yield ``None``.
	"""
	fields = line.split(",")
	if len(fields) >= 3:
		return (str(fields[1]), [str(fields[2])])
	return None

if __name__ == "__main__":
	# Spark context configured with the Cassandra seed node so that
	# cassandraTable() can read from the keyspace below.
	conf = SparkConf().setAppName("UserUserRelevance").setMaster(config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
	sc = CassandraSparkContext(conf=conf)
	
	# The input log file is named after today's date (YYYY-MM-DD prefix).
	filename = datetime.now().strftime("%Y-%m-%d")+"-usersonglog.txt"

	# Collect to the driver the key of every parsed log entry that passes
	# time_range_filter (the key is the second comma-separated field).
	# NOTE(review): parse_log_entry returns None for lines with fewer than
	# three fields, and .keys() indexes each element -- unless
	# time_range_filter already rejects such lines this would fail; confirm.
	# NOTE(review): there is no distinct() here, so a user appearing on several
	# log lines is iterated several times below -- confirm this is intended.
	users = sc.textFile(config.HDFS_URL+":"+config.HDFS_PORT+config.LOG_FOLDER+filename) \
						.filter(time_range_filter) \
						.map(parse_log_entry) \
						.keys() \
						.collect()

	song_map = {} # store song to user mapping for use in later stages

	# Handles on the two Cassandra tables queried per user.
	usersongdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "user_to_song")
	songuserdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "song_to_user")

	for user in users:
		user_suggest = []
		# Distinct song ids requested by this user inside the open interval
		# (five_weeks_back, now+1) of req_time.
		# NOTE(review): five_weeks_back and now are not defined in the visible
		# part of the file -- verify they are set before this loop.
		song_list = usersongdb.select("song_id") \
				.where("user_id=? and req_time > ? and req_time < ?", int(user), five_weeks_back, now+1) \
				.map(lambda row: row.song_id) \
				.distinct() \
				.collect()
            formatted_rdd_per_user = recomm_per_user.map(lambda row: (t_id,row[0],row[1]))
            all_results_per_user = all_results_per_user.union(formatted_rdd_per_user)
            formatted_rdd_per_item = recomm_per_item.map(lambda row: (t_id,row[0],row[1]))
            all_results_per_item = all_results_per_item.union(formatted_rdd_per_item)
                
        if SAVE_DATA_TO_DB:
            save_to_mongo(all_results_per_user,dbtable_out_per_user)
            save_to_mongo(all_results_per_item,dbtable_out_per_item)
        else:
            print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item)
            print(all_results_per_user.collect())
    else:
        data_rdd = sc.textFile(test_file).map(lambda l: l.split(','))
        recomm_per_user,recomm_per_item = test(data_rdd, num_to_recomm_per_user,num_to_recomm_per_item)

        if SAVE_DATA_TO_DB:
            save_to_mongo(recomm_per_user,dbtable_out_per_user)
            save_to_mongo(recomm_per_item,dbtable_out_per_item)
        else:
            print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item)
            print(recomm_per_item.collect())
            
    elapsed = (time.time() - t0)
    sc.stop()
    print ("\nIt took %.2fsec to complete" % elapsed) 
            formatted_rdd_per_user = recomm_per_user.map(lambda row: (t_id,row[0],row[1]))
            all_results_per_user = all_results_per_user.union(formatted_rdd_per_user)
            formatted_rdd_per_item = recomm_per_item.map(lambda row: (t_id,row[0],row[1]))
            all_results_per_item = all_results_per_item.union(formatted_rdd_per_item)
                
        if SAVE_DATA_TO_DB:
            save_to_mongo(all_results_per_user,dbtable_out_per_user)
            save_to_mongo(all_results_per_item,dbtable_out_per_item)
        else:
            print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item)
            print(all_results_per_user.collect())
    else:
        print ('loading...')
        datafile = sc.textFile("data2.csv").map(lambda l: l.split(','))
        recomm_per_user,recomm_per_item = recomm_cf.TrainAndComputeRecommendation(sc, datafile,
                                                                        num_to_recomm_per_user,
                                                                        num_to_recomm_per_item)

        if SAVE_DATA_TO_DB:
            save_to_mongo(recomm_per_user,dbtable_out_per_user)
            save_to_mongo(recomm_per_item,dbtable_out_per_item)
        else:
            print("%d recommendations per user:"******"%d recommendations per item:" % num_to_recomm_per_item)
            print(recomm_per_item.collect())
            
    elapsed = (time.time() - t0)
    sc.stop()