def create_plays_by_user_filtered(plays_by_user, most_popular_songs, target_collection, disable_filtering=False):
    """Copy per-user play maps, keeping only the most popular songs.

    For every document returned by ``plays_by_user.find()`` a new document
    with the same ``_id`` is built whose ``value`` holds only the
    ``song_id -> play_count`` pairs whose ``int(song_id)`` matches the
    integer ``_id`` of an entry in ``most_popular_songs``.

    Args:
        plays_by_user: Object exposing ``find()``; each yielded document
            must have ``_id`` and a mapping under ``value``.
        most_popular_songs: Iterable of documents with an ``_id`` that
            ``int()`` accepts.
        target_collection: Passed through unchanged to ``insert_batch``.
        disable_filtering: When true, users are kept even if one or no
            songs survive the popularity filter.  Otherwise only users with
            more than one remaining song are written.
    """
    # Build the lookup once as a set: membership tests are O(1), and unlike a
    # lazy ``map`` object it can be queried repeatedly without being consumed.
    most_popular_song_ids = set(int(song['_id']) for song in most_popular_songs)

    plays_batch = []
    for user_play_map in plays_by_user.find():
        entry = {'_id': user_play_map['_id'], 'value': {}}
        for song_id, play_count in user_play_map['value'].items():
            if int(song_id) in most_popular_song_ids:
                # Keep the original (non-converted) key in the output.
                entry['value'][song_id] = play_count

        if disable_filtering or len(entry['value']) > 1:
            plays_batch.append(entry)

        # The emptiness guard matters because a skipped user can leave the
        # batch empty, and 0 % BATCH_SIZE == 0 would trigger an empty insert.
        if plays_batch and len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    # Flush whatever did not fill a complete batch.
    if plays_batch:
        insert_batch(plays_batch, target_collection)
def create_plays_by_most_often_played_song(db, source_collection_name, target_collection_name):
    """Bucket user play documents by their first listed song and store the buckets.

    Every document from ``db[source_collection_name].find()`` is placed in
    the bucket keyed by ``document['value'][0]``.  Each bucket is then
    written to ``db[target_collection_name]`` as
    ``{'_id': <song>, 'value': [{'_id': <user id>, 'value': <plays>}, ...]}``
    in slices of ``BATCH_SIZE`` via ``insert_batch``.

    NOTE(review): a function with this same name is defined again later in
    this file; at import time that later definition replaces this one.

    Args:
        db: Object indexable by collection name.
        source_collection_name: Name of the collection to read from.
        target_collection_name: Name of the collection to write to.

    Raises:
        IndexError: If a source document has an empty ``value`` sequence.
    """
    buckets = {}
    for i, plays_by_user in enumerate(db[source_collection_name].find(), 1):
        # Progress output for the first user and every 100th one after that.
        if i == 1 or i % 100 == 0:
            print("Looking for bucket for %d. user" % i)
        user_id = plays_by_user['_id']
        plays = plays_by_user['value']
        # Presumably ``plays`` is ordered by play count, most played first
        # (verify against whatever populates the source collection).
        most_often_played_song = plays[0]
        buckets.setdefault(most_often_played_song, []).append({'_id': user_id, 'value': plays})
    print("Number of buckets: %d" % len(buckets))

    target_collection = db[target_collection_name]
    batch = [{'_id': bucket_id, 'value': members} for bucket_id, members in buckets.items()]
    print("Buckets batch ready")

    # Write the bucket documents in consecutive BATCH_SIZE-sized slices.
    for start in range(0, len(batch), BATCH_SIZE):
        insert_batch(batch[start:start + BATCH_SIZE], target_collection)
def import_triplets_from_file(filename, db_collection):
    """Parse a file line by line and insert the results in batches.

    Each line of ``filename`` is converted with
    ``get_triplet_from_fileline`` and collected; whenever the collected
    count reaches a multiple of ``BATCH_SIZE`` the batch is handed to
    ``insert_batch`` together with ``db_collection``.  Any remainder is
    inserted after the last line.
    """
    pending = []
    with open(filename) as source:
        for raw_line in source:
            pending.append(get_triplet_from_fileline(raw_line))
            if len(pending) % BATCH_SIZE == 0:
                insert_batch(pending, db_collection)
                pending = []
        # Leftover triplets that did not fill a whole batch.
        if pending:
            insert_batch(pending, db_collection)
def create_plays_by_user_binary(plays_by_user, most_popular_songs, target_collection):
    """Write, for every user, a played / not-played flag per popular song.

    For each document from ``plays_by_user.find()`` a document with the same
    ``_id`` is produced whose ``value`` maps ``str(int(song['_id']))`` for
    every entry of ``most_popular_songs`` to a boolean: whether that key is
    present in the user's own ``value``.  Documents are passed to
    ``insert_batch`` in groups of ``BATCH_SIZE``.

    Args:
        plays_by_user: Object exposing ``find()``; each yielded document
            must have ``_id`` and a container under ``value``.
        most_popular_songs: Iterable of documents with an ``_id`` that
            ``int()`` accepts.  It is consumed exactly once.
        target_collection: Passed through unchanged to ``insert_batch``.
    """
    # The keys depend only on most_popular_songs, so convert them once up
    # front instead of once per user.  This also makes the function work
    # when most_popular_songs can only be iterated a single time.
    song_keys = [str(int(song['_id'])) for song in most_popular_songs]

    plays_batch = []
    for user_play_map in plays_by_user.find():
        user_plays = user_play_map['value']
        plays_batch.append({
            '_id': user_play_map['_id'],
            'value': {song_key: song_key in user_plays for song_key in song_keys},
        })
        if len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    # Flush whatever did not fill a complete batch.
    if plays_batch:
        insert_batch(plays_batch, target_collection)
def create_plays_by_user_filtered_simple(plays_by_user, target_collection):
    """Store each user's song ids ordered by descending play count.

    For every document from ``plays_by_user.find()`` the keys of its
    ``value`` are sorted by the count stored under them, highest first.
    Users with more than one song are written to ``target_collection`` as
    ``{'_id': <user id>, 'value': [<song id>, ...]}`` through
    ``insert_batch``, in groups of ``BATCH_SIZE``.
    """
    pending = []
    for user_doc in plays_by_user.find():
        user_id = user_doc['_id']
        play_counts = user_doc['value']
        # Most played song first.
        ranked_songs = sorted(
            play_counts,
            key=lambda song_id: play_counts[song_id],
            reverse=True,
        )
        if len(ranked_songs) > 1:
            pending.append({'_id': user_id, 'value': ranked_songs})
        # A skipped user can leave the batch empty, hence the emptiness guard.
        if pending and len(pending) % BATCH_SIZE == 0:
            insert_batch(pending, target_collection)
            pending = []
    if pending:
        insert_batch(pending, target_collection)
def create_plays_by_most_often_played_song(db, source_collection_name, target_collection_name):
    """Bucket user play documents by their first listed song and store the buckets.

    Every document from ``db[source_collection_name].find()`` is placed in
    the bucket keyed by ``document['value'][0]``.  Each bucket is then
    written to ``db[target_collection_name]`` as
    ``{'_id': <song>, 'value': [{'_id': <user id>, 'value': <plays>}, ...]}``
    in slices of ``BATCH_SIZE`` via ``insert_batch``.

    NOTE(review): an earlier definition with this same name appears above
    in this file; this later one is the binding that remains after import.

    Args:
        db: Object indexable by collection name.
        source_collection_name: Name of the collection to read from.
        target_collection_name: Name of the collection to write to.

    Raises:
        IndexError: If a source document has an empty ``value`` sequence.
    """
    buckets = {}
    for i, plays_by_user in enumerate(db[source_collection_name].find(), 1):
        # Progress output for the first user and every 100th one after that.
        if i == 1 or i % 100 == 0:
            print("Looking for bucket for %d. user" % i)
        user_id = plays_by_user['_id']
        plays = plays_by_user['value']
        # Presumably ``plays`` is ordered by play count, most played first
        # (verify against whatever populates the source collection).
        most_often_played_song = plays[0]
        buckets.setdefault(most_often_played_song, []).append({'_id': user_id, 'value': plays})
    print("Number of buckets: %d" % len(buckets))

    target_collection = db[target_collection_name]
    batch = [{'_id': bucket_id, 'value': members} for bucket_id, members in buckets.items()]
    print("Buckets batch ready")

    # Write the bucket documents in consecutive BATCH_SIZE-sized slices.
    for start in range(0, len(batch), BATCH_SIZE):
        insert_batch(batch[start:start + BATCH_SIZE], target_collection)