def create_plays_by_user_filtered(plays_by_user,
                                  most_popular_songs,
                                  target_collection,
                                  disable_filtering=False):
    """Copy per-user play counts, restricted to the most popular songs.

    For every document returned by ``plays_by_user.find()`` a new entry with
    the same ``_id`` is built whose ``value`` holds only the songs whose id
    (compared as ``int``) appears in ``most_popular_songs``. Entries are
    handed to ``insert_batch`` in groups of ``BATCH_SIZE``.

    Args:
        plays_by_user: collection-like object whose ``find()`` yields
            documents with an ``_id`` and a ``value`` mapping of
            song id -> play count.
        most_popular_songs: iterable of documents with an ``_id`` that can
            be converted with ``int()``.
        target_collection: destination passed through to ``insert_batch``.
        disable_filtering: when true, users are kept even if at most one of
            their songs is popular. Songs are still restricted to the
            popular set either way.
    """
    # A set is built once so each membership test is O(1). The previous
    # map() result was a list on Python 2 (linear scan per song) and is a
    # one-shot iterator on Python 3 (exhausted after the first probe).
    popular_ids = {int(song['_id']) for song in most_popular_songs}

    plays_batch = []
    for user_play_map in plays_by_user.find():
        # .items() behaves the same on Python 2 and 3 for this read-only loop.
        kept = {
            song_id: play_count
            for song_id, play_count in user_play_map['value'].items()
            if int(song_id) in popular_ids
        }

        if disable_filtering or len(kept) > 1:
            plays_batch.append({'_id': user_play_map['_id'], 'value': kept})

            # The batch can only reach a multiple of BATCH_SIZE right after
            # an append, so the flush check lives here.
            if len(plays_batch) % BATCH_SIZE == 0:
                insert_batch(plays_batch, target_collection)
                plays_batch = []

    # Flush whatever is left over after the last full batch.
    if plays_batch:
        insert_batch(plays_batch, target_collection)
def create_plays_by_most_often_played_song(db, source_collection_name, target_collection_name):
    """Bucket users by the first song in their play list and store the buckets.

    Every document of ``db[source_collection_name]`` is placed in a bucket
    keyed by ``document['value'][0]``. One document per bucket
    (``{'_id': song, 'value': [user documents]}``) is then written to
    ``db[target_collection_name]`` through ``insert_batch`` in slices of
    ``BATCH_SIZE``.

    NOTE(review): the function name implies ``value`` is ordered with the
    most played song first (as the sort in
    ``create_plays_by_user_filtered_simple`` would produce) - confirm the
    source collection is built that way.

    Args:
        db: mapping-like database handle, indexed by collection name.
        source_collection_name: name of the collection to read users from.
        target_collection_name: name of the collection receiving buckets.
    """
    buckets = {}

    # print(...) with a single pre-formatted string emits the same text on
    # Python 2 (statement) and Python 3 (function).
    for i, plays_by_user in enumerate(db[source_collection_name].find(), 1):
        if i == 1 or i % 100 == 0:
            print("Looking for bucket for %d. user" % i)

        plays = plays_by_user['value']
        if not plays:
            # A user without plays has no song to be bucketed under;
            # indexing plays[0] would raise, so skip the user instead.
            continue

        buckets.setdefault(plays[0], []).append(
            {'_id': plays_by_user['_id'], 'value': plays})

    print("Number of buckets: %d" % len(buckets))

    target_collection = db[target_collection_name]
    batch = [{'_id': bucket_id, 'value': members}
             for bucket_id, members in buckets.items()]
    print("Buckets batch ready")

    # Insert the bucket documents slice by slice.
    for start in range(0, len(batch), BATCH_SIZE):
        insert_batch(batch[start:start + BATCH_SIZE], target_collection)
def import_triplets_from_file(filename, db_collection):
    """Load the triplets stored in ``filename`` into ``db_collection``.

    Each line of the file is converted with ``get_triplet_from_fileline``.
    The converted triplets are accumulated and passed to ``insert_batch``
    whenever the pending list length is a multiple of ``BATCH_SIZE``, plus
    once more at the end for any remainder.
    """
    pending = []
    with open(filename) as source:
        for raw_line in source:
            pending.append(get_triplet_from_fileline(raw_line))

            if len(pending) % BATCH_SIZE != 0:
                continue

            insert_batch(pending, db_collection)
            pending = []

        # Remainder that did not fill a whole batch.
        if pending:
            insert_batch(pending, db_collection)
def create_plays_by_user_binary(plays_by_user, most_popular_songs, target_collection):
    """Store, per user, a played / not-played flag for every popular song.

    For each document from ``plays_by_user.find()`` an entry with the same
    ``_id`` is created whose ``value`` maps every popular song key
    (``str(int(song['_id']))``) to ``True`` when that key is present in the
    user's ``value`` and ``False`` otherwise. Entries are written through
    ``insert_batch`` in groups of ``BATCH_SIZE``.

    Args:
        plays_by_user: collection-like object whose ``find()`` yields
            documents with ``_id`` and ``value``.
        most_popular_songs: iterable of documents with an ``_id``
            convertible via ``int()``.
        target_collection: destination passed through to ``insert_batch``.
    """
    # Normalise the song keys once instead of once per user. Materialising
    # them also keeps every user's entry complete if most_popular_songs is a
    # one-shot iterable (e.g. a database cursor), which the per-user loop
    # would otherwise exhaust after the first user.
    song_keys = [str(int(song['_id'])) for song in most_popular_songs]

    plays_batch = []
    for user_play_map in plays_by_user.find():
        played = user_play_map['value']
        plays_batch.append({
            '_id': user_play_map['_id'],
            'value': {key: key in played for key in song_keys},
        })

        if len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    # Flush the final, partially filled batch.
    if plays_batch:
        insert_batch(plays_batch, target_collection)
# Example #5
# 0
def create_plays_by_user_binary(plays_by_user, most_popular_songs,
                                target_collection):
    """Write a per-user map of popular song key -> "has the user played it".

    Each document yielded by ``plays_by_user.find()`` becomes an entry with
    the same ``_id``; its ``value`` contains one boolean per popular song,
    keyed by ``str(int(song['_id']))`` and set by testing whether that key
    is in the user's own ``value``. Entries go to ``insert_batch`` every
    ``BATCH_SIZE`` users, with a final call for the remainder.

    Args:
        plays_by_user: collection-like object exposing ``find()``.
        most_popular_songs: iterable of documents carrying an ``_id`` that
            ``int()`` accepts.
        target_collection: destination handed on to ``insert_batch``.
    """
    # The key conversion does not depend on the user, so do it a single
    # time up front. Holding the keys in a list also means a one-shot
    # iterable (such as a cursor) is not silently empty for every user
    # after the first.
    popular_keys = [str(int(song['_id'])) for song in most_popular_songs]

    plays_batch = []
    for user_play_map in plays_by_user.find():
        user_songs = user_play_map['value']
        flags = {}
        for song_key in popular_keys:
            flags[song_key] = song_key in user_songs

        plays_batch.append({'_id': user_play_map['_id'], 'value': flags})

        if len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    # Anything left after the last full batch still has to be stored.
    if plays_batch:
        insert_batch(plays_batch, target_collection)
def create_plays_by_user_filtered_simple(plays_by_user, target_collection):
    """Store each user's song ids ordered by descending play count.

    For every document from ``plays_by_user.find()`` the keys of its
    ``value`` mapping are sorted by their associated value, highest first.
    Users with more than one song are collected and passed to
    ``insert_batch`` in groups of ``BATCH_SIZE``.
    """
    pending = []

    for user_doc in plays_by_user.find():
        user_id = user_doc['_id']
        counts = user_doc['value']

        # Song ids ranked by play count, most played first.
        ranked = sorted(counts, key=lambda song: counts[song], reverse=True)

        if len(ranked) > 1:
            pending.append({'_id': user_id, 'value': ranked})

        if pending and len(pending) % BATCH_SIZE == 0:
            insert_batch(pending, target_collection)
            pending = []

    if pending:
        insert_batch(pending, target_collection)
def create_plays_by_user_filtered_simple(plays_by_user, target_collection):
    """Write, per user, the list of song ids sorted by play count.

    Reads every document from ``plays_by_user.find()``, turns the keys of
    its ``value`` mapping into a list sorted by the mapped value in
    descending order, and keeps the user only when that list has more than
    one element. Kept entries are sent to ``insert_batch`` each time
    ``BATCH_SIZE`` of them have accumulated, and once more for leftovers.
    """
    batch = []

    for document in plays_by_user.find():
        owner = document['_id']
        play_counts = document['value']

        song_ids = list(play_counts)
        # Highest play count first.
        song_ids.sort(key=lambda sid: play_counts[sid], reverse=True)

        if len(song_ids) <= 1:
            # Nothing was added, so the batch cannot have become full.
            continue

        batch.append({'_id': owner, 'value': song_ids})
        if len(batch) % BATCH_SIZE == 0:
            insert_batch(batch, target_collection)
            batch = []

    if len(batch) > 0:
        insert_batch(batch, target_collection)
def create_plays_by_most_often_played_song(db, source_collection_name,
                                           target_collection_name):
    """Group users into buckets keyed by the first song of their play list.

    Reads every document from ``db[source_collection_name]``, uses
    ``document['value'][0]`` as the bucket key and appends
    ``{'_id': user id, 'value': plays}`` to that bucket. Afterwards one
    document per bucket is written to ``db[target_collection_name]`` via
    ``insert_batch``, ``BATCH_SIZE`` buckets at a time.

    NOTE(review): presumably ``value`` is a list sorted by play count with
    the most played song first - verify against the producer of the source
    collection.

    Args:
        db: mapping-like database handle, indexed by collection name.
        source_collection_name: collection holding the per-user play lists.
        target_collection_name: collection that receives the buckets.
    """
    buckets = {}

    # Progress is reported for the first user and then every 100th. The
    # parenthesised single-argument print works on Python 2 and Python 3.
    for position, user_doc in enumerate(db[source_collection_name].find(), 1):
        if position == 1 or position % 100 == 0:
            print("Looking for bucket for %d. user" % position)

        plays = user_doc['value']
        if not plays:
            # An empty play list has no first song; skip the user rather
            # than fail on plays[0].
            continue

        bucket = buckets.setdefault(plays[0], [])
        bucket.append({'_id': user_doc['_id'], 'value': plays})

    print("Number of buckets: %d" % len(buckets))

    target_collection = db[target_collection_name]
    batch = [{
        '_id': bucket_id,
        'value': value
    } for bucket_id, value in buckets.items()]
    print("Buckets batch ready")

    # Hand the buckets over in BATCH_SIZE-sized slices.
    for start in range(0, len(batch), BATCH_SIZE):
        insert_batch(batch[start:start + BATCH_SIZE], target_collection)