Exemplo n.º 1
0
    def find_for_users(self, users_plays):
        scores = []

        current_user_number = 1
        for user_plays in users_plays:
            if current_user_number == 1 or current_user_number % 100 == 0:
                print 'Looking for knn for %s. user' % current_user_number
            scores.append({'user_id': user_plays['_id'], 'scores': self.find_for_user(user_plays)})

            if current_user_number % 10000 == 0:
                invoke_measurable_task(lambda: self.write_scores_to_file(scores), "Writing %d scores to file..." % len(scores))
                scores = []

            current_user_number += 1

        invoke_measurable_task(lambda: self.write_scores_to_file(scores), "Writing %d scores to file..." % len(scores))
            for score in scores[0]['scores']:
                file.write(' ' + str(score['user_id']))

        return scores


with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    plays_for_all_users = [1]

    def load_plays_for_all_users():
        plays_for_all_users[0] = list(db.plays_by_user_filtered_t.find())

    invoke_measurable_task(load_plays_for_all_users,
                           'Load plays for all users')
    plays_for_all_users = plays_for_all_users[0]

    plays_for_validated_users = [1]

    def load_plays_for_all_users():
        plays_for_validated_users[0] = list(
            db.plays_by_user_filtered_v.find().limit(NUMBER_OF_USERS))

    invoke_measurable_task(load_plays_for_all_users,
                           'Load plays for validated users')
    plays_for_validated_users = plays_for_validated_users[0]

    knn = JaccardBasedKnn(plays_for_all_users, KNN_K)
    invoke_measurable_task(
        lambda: knn.find_for_users(plays_for_validated_users),
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    if len(plays_batch) > 0:
        insert_batch(plays_batch, target_collection)


# Script section: load the most popular songs, then build the filtered
# per-user play collections for the train (_t) and validation (_v) sets.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element list used as a mutable cell: the loader writes its result
    # into slot 0 so it can be read back once the task has run.
    most_popular_songs = [1]

    def load_most_popular_songs():
        """Store the top songs, sorted by descending 'value', in slot 0."""
        ranked = db.play_count_by_song_t.find().sort('value', DESCENDING)
        top = ranked.limit(NUMBER_OF_MOST_POPULAR_SONGS)
        most_popular_songs[0] = list(top)

    invoke_measurable_task(
        load_most_popular_songs,
        'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS)
    # Unwrap the cell: from here on the name refers to the loaded list itself.
    most_popular_songs = most_popular_songs[0]

    invoke_measurable_task(
        lambda: create_plays_by_user_filtered(
            db.plays_by_user_t,
            most_popular_songs,
            db.plays_by_user_filtered_t),
        "Create plays_by_user_filtered collection for train set")

    invoke_measurable_task(
        lambda: create_plays_by_user_filtered(
            db.plays_by_user_v,
            most_popular_songs,
            db.plays_by_user_filtered_v,
            True),
        "Create plays_by_user_filtered collection for validation set")
        for line in knn_results:
            print i
            knn_map.append(get_data_from_fileline(line))
            i += 1

with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local
    most_popular_songs = [1]

    def load_most_popular_songs():
        most_popular_songs[0] = list(
            db.play_count_by_song_t.find()
            .sort('value', DESCENDING)
            .limit(NUMBER_OF_MOST_POPULAR_SONGS)
        )
    invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS)
    most_popular_songs = most_popular_songs[0]

    load_knn_results()

    with open(KAGGLE_USERS_MAPPING_FILE_PATH) as kaggle_users:
        with open(KNN_CONVERTED_PATH, 'w') as my_file:
            my_file.write('Id,Expected\n')
            j = 1
            for index, user_id in enumerate(kaggle_users):
                my_file.write(user_id.strip() + ',')
                result = knn_map[index]
                index += 1

                predicted_songs = 0
                for song in result[1]:
    print 'Query created'
    songs_group = plays_by_user_binary_t.aggregate(pipeline)

# Script section: load the most popular songs and run the grouping tasks
# (one_by_one, pairs) against the plays_by_user_binary_t collection.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element list used as a mutable cell: the loader below writes its
    # result into slot 0 so it can be read back after the task has run.
    most_popular_songs = [1]

    def load_most_popular_songs():
        # Stores the first NUMBER_OF_MOST_POPULAR_SONGS documents of
        # play_count_by_song_t, sorted by 'value' in descending order, into
        # slot 0 of whatever `most_popular_songs` refers to at call time.
        most_popular_songs[0] = list(
            db.play_count_by_song_t.find()
            .sort('value', DESCENDING)
            .limit(NUMBER_OF_MOST_POPULAR_SONGS)
        )

    invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS)
    # Unwrap the cell: the name now refers to the loaded list itself.
    most_popular_songs = most_popular_songs[0]
    print NUMBER_OF_MOST_POPULAR_SONGS, ' most popular songs selected\n'

    invoke_measurable_task(
        lambda: one_by_one(most_popular_songs, db.plays_by_user_binary_t),
        'Group by most popular song one by one')

    invoke_measurable_task(
        lambda: pairs(most_popular_songs, db.plays_by_user_binary_t),
        'Group by pairs of most popular songs')

    # NOTE(review): `most_popular_songs` was rebound to the loaded list above,
    # so (assuming invoke_measurable_task calls the loader) this second load
    # overwrites that list's first element instead of a fresh cell, and would
    # raise IndexError if the list were empty. The loader also still limits by
    # NUMBER_OF_MOST_POPULAR_SONGS although the message and the print below
    # report NUMBER_OF_SONGS_AT_ONE -- confirm which count is intended.
    invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_SONGS_AT_ONE)
    most_popular_songs = most_popular_songs[0]
    print NUMBER_OF_SONGS_AT_ONE, 'most popular songs selected\n'
def plays_by_user_filtered():
    """Run create_plays_by_user_filtered.py through the system shell."""
    command = 'python create_plays_by_user_filtered.py'
    os.system(command)


def delete_map_reduce_contetn():
    """Remove generated map-reduce output from ``./map_reduce/``.

    Deletes every sub-directory of the folder, then the
    ``created_collections.txt`` file if it is present. Other plain files in
    the folder are left alone. A failure to delete one directory is printed
    and does not stop the remaining entries from being processed.

    Raises:
        OSError: if ``./map_reduce/`` itself cannot be listed.
    """
    folder = './map_reduce/'
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        if not os.path.isdir(entry_path):
            continue
        try:
            shutil.rmtree(entry_path)
        except Exception as e:
            # Best-effort cleanup: report the problem and keep going.
            # The `as` form and print(...) are valid on Python 2.6+ and 3.
            print(e)
    try:
        # Derived from `folder` so the path is defined in one place only.
        os.remove(os.path.join(folder, 'created_collections.txt'))
    except OSError:
        # Nothing to remove: the file was never created.
        pass


# Script section: run every database initialisation step, in order.
print('INITIALIZING DATABASE')
invoke_measurable_task(
    lambda: import_triplets_to_mongo_db(),
    'Import triplets to mongo db')
invoke_measurable_task(
    lambda: create_map_reduce_collections(),
    'Create Map Reduce collections')
invoke_measurable_task(
    lambda: plays_by_user_simple(),
    'Create collection plays_by_user_simple')
invoke_measurable_task(
    lambda: plays_by_user_binary(),
    'Create collection plays_by_user_binary')
invoke_measurable_task(
    lambda: plays_by_user_filtered(),
    'Create collection plays_by_user_filtered')
def get_triplet_from_fileline(line):
    """Parse one whitespace-separated triplet line into index form.

    The line is read as user id, song id and play count, in that order. A
    user id that is not yet in ``users_map`` is registered under the next
    index taken from the ``last_index`` counter cell.

    Args:
        line: text line holding at least three whitespace-separated fields.

    Returns:
        dict with the keys 'user_index', 'song_index' and 'play_count'.

    Raises:
        IndexError: if the line has fewer than three fields.
        KeyError: if the song id is not present in ``songs_map``.
    """
    fields = line.strip().split()
    user_id = fields[0]

    if user_id not in users_map:
        # First time this user is seen: hand out the next free index.
        last_index[0] += 1
        users_map[user_id] = last_index[0]
    user_index = users_map[user_id]

    triplet = {'user_index': user_index}
    triplet['song_index'] = songs_map[fields[1]]
    triplet['play_count'] = int(fields[2])
    return triplet


# Script section: load both Kaggle mappings first, then import the train and
# validation triplet files into their MongoDB collections.
invoke_measurable_task(
    load_kaggle_users_mapping,
    'Load Kaggle users mapping')
invoke_measurable_task(
    load_kaggle_songs_mapping,
    'Load Kaggle songs mapping')

with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    invoke_measurable_task(
        lambda: import_triplets_from_file(
            TRAIN_TRIPLETS_FILE_PATH,
            db.triplets_t),
        'Import train triplets')

    invoke_measurable_task(
        lambda: import_triplets_from_file(
            TEST_TRIPLETS_FILE_PATH,
            db.triplets_v),
        'Import validation triplets')
Exemplo n.º 8
0
        reduce = reduce_file.read()

    finalize_path = '%s/finalize.js' % directory
    if exists(finalize_path):
        with open(finalize_path) as finalize_file:
            finalize = finalize_file.read()
    else:
        finalize = None

    collection_path = '%s/source_collection.txt' % directory
    if exists(collection_path):
        with open(collection_path) as collection_file:
            source_collection_name = collection_file.read()
    else:
        source_collection_name = 'triplets'

    with MongoClient('localhost', MONGODB_PORT) as client:
        db = client.local

        invoke_measurable_task(
            lambda: db[source_collection_name + '_t'].map_reduce(
                map, reduce, directory + '_t', finalize=finalize),
            "Create map reduce collection %s for train set" % directory)

        invoke_measurable_task(
            lambda: db[source_collection_name + '_v'].map_reduce(
                map, reduce, directory + '_v', finalize=finalize),
            "Create map reduce collection %s for validation set" % directory)

    with open(CREATED_COLLECTIONS_FILE_PATH, 'a') as file:
        file.write('%s\n' % directory)
        current_user_number = 1
        for user_play in users_plays:
            print 'Looking for knn for %s. user' % current_user_number
            scores.append({'user_id': user_play['_id'], 'scores': self.find_for_user(set(user_play['value']))})
            current_user_number += 1

        with open('../results.txt', 'w') as file:
            file.write(str(scores[0]['user_id']))

            for score in scores[0]['scores']:
                file.write(' ' + str(score['user_id']))

        return scores


# Script section: load the train and validation plays, then run the
# Jaccard-based knn search for the validation users.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element lists act as mutable cells the loaders can write into.
    plays_for_all_users = [1]

    def load_plays_for_all_users():
        """Store every document of plays_by_user_filtered_t in slot 0."""
        train_cursor = db.plays_by_user_filtered_t.find()
        plays_for_all_users[0] = list(train_cursor)

    invoke_measurable_task(
        load_plays_for_all_users,
        'Load plays for all users')
    plays_for_all_users = plays_for_all_users[0]

    plays_for_validated_users = [1]

    # Redefines the loader name used above; this version fills the
    # validation cell instead.
    def load_plays_for_all_users():
        """Store up to NUMBER_OF_USERS validation documents in slot 0."""
        validation_cursor = db.plays_by_user_filtered_v.find()
        limited = validation_cursor.limit(NUMBER_OF_USERS)
        plays_for_validated_users[0] = list(limited)

    invoke_measurable_task(
        load_plays_for_all_users,
        'Load plays for validated users')
    plays_for_validated_users = plays_for_validated_users[0]

    knn = JaccardBasedKnn(plays_for_all_users, KNN_K)
    invoke_measurable_task(
        lambda: knn.find_for_users(plays_for_validated_users),
        'Find knn for %d users' % NUMBER_OF_USERS)
        most_often_played_song = plays[0]

        if most_often_played_song in buckets:
            bucket = buckets[most_often_played_song]
        else:
            bucket = []
            buckets[most_often_played_song] = bucket

        bucket.append({'_id': user_id, 'value': plays})
        i += 1

    print "Number of buckets: %d" % len(buckets)

    target_collection = db[target_collection_name]
    batch = [{'_id': bucket_id, 'value': value} for bucket_id, value in buckets.iteritems()]
    print "Buckets batch ready"

    start = 0
    end = BATCH_SIZE
    while start < len(batch):
        insert_batch(batch[start:end], target_collection)
        start += BATCH_SIZE
        end += BATCH_SIZE


# Script section: build the plays_by_most_often_played_song_t collection
# from plays_by_user_simple_t (train set).
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    invoke_measurable_task(
        lambda: create_plays_by_most_often_played_song(
            db,
            'plays_by_user_simple_t',
            'plays_by_most_often_played_song_t'),
        'Create plays_by_most_often_played_song collection for train set')
        else:
            bucket = []
            buckets[most_often_played_song] = bucket

        bucket.append({'_id': user_id, 'value': plays})
        i += 1

    print "Number of buckets: %d" % len(buckets)

    target_collection = db[target_collection_name]
    batch = [{
        '_id': bucket_id,
        'value': value
    } for bucket_id, value in buckets.iteritems()]
    print "Buckets batch ready"

    start = 0
    end = BATCH_SIZE
    while start < len(batch):
        insert_batch(batch[start:end], target_collection)
        start += BATCH_SIZE
        end += BATCH_SIZE


# Script section: build the plays_by_most_often_played_song_t collection
# from plays_by_user_simple_t (train set).
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    invoke_measurable_task(
        lambda: create_plays_by_most_often_played_song(
            db,
            'plays_by_user_simple_t',
            'plays_by_most_often_played_song_t'),
        'Create plays_by_most_often_played_song collection for train set')
    for plays_by_user in plays_by_user.find():
        entry = {
            '_id': plays_by_user['_id'],
            'value': [song_id for song_id in plays_by_user['value']]
        }

        # sort by plays count
        entry['value'].sort(key=lambda x: plays_by_user['value'][x], reverse=True)

        if len(entry['value']) > 1:
            plays_batch.append(entry)

        if len(plays_batch) > 0 and len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    if len(plays_batch) > 0:
        insert_batch(plays_batch, target_collection)


# Script section: build the plays_by_user_simple collections for the train
# (_t) and validation (_v) sets.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    invoke_measurable_task(
        lambda: create_plays_by_user_filtered_simple(
            db.plays_by_user_t,
            db.plays_by_user_simple_t),
        "Create plays_by_user_simple_filtered collection for train set")

    invoke_measurable_task(
        lambda: create_plays_by_user_filtered_simple(
            db.plays_by_user_v,
            db.plays_by_user_simple_v),
        "Create plays_by_user_simple_filtered collection for validation set")
    print 'Query created'
    songs_group = plays_by_user_binary_t.aggregate(pipeline)


# Script section: load the most popular songs and run the grouping tasks
# (one_by_one, pairs) against the plays_by_user_binary_t collection.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element list used as a mutable cell: the loader below writes its
    # result into slot 0 so it can be read back after the task has run.
    most_popular_songs = [1]

    def load_most_popular_songs():
        # Stores the first NUMBER_OF_MOST_POPULAR_SONGS documents of
        # play_count_by_song_t, sorted by 'value' in descending order, into
        # slot 0 of whatever `most_popular_songs` refers to at call time.
        most_popular_songs[0] = list(db.play_count_by_song_t.find().sort(
            'value', DESCENDING).limit(NUMBER_OF_MOST_POPULAR_SONGS))

    invoke_measurable_task(
        load_most_popular_songs,
        'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS)
    # Unwrap the cell: the name now refers to the loaded list itself.
    most_popular_songs = most_popular_songs[0]
    print NUMBER_OF_MOST_POPULAR_SONGS, ' most popular songs selected\n'

    invoke_measurable_task(
        lambda: one_by_one(most_popular_songs, db.plays_by_user_binary_t),
        'Group by most popular song one by one')

    invoke_measurable_task(
        lambda: pairs(most_popular_songs, db.plays_by_user_binary_t),
        'Group by pairs of most popular songs')

    # NOTE(review): `most_popular_songs` was rebound to the loaded list above,
    # so (assuming invoke_measurable_task calls the loader) this second load
    # overwrites that list's first element instead of a fresh cell, and would
    # raise IndexError if the list were empty. The loader also still limits by
    # NUMBER_OF_MOST_POPULAR_SONGS although the message reports
    # NUMBER_OF_SONGS_AT_ONE -- confirm which count is intended.
    invoke_measurable_task(
        load_most_popular_songs,
        'Load %d most popular songs' % NUMBER_OF_SONGS_AT_ONE)

def plays_by_user_filtered():
    """Run create_plays_by_user_filtered.py through the system shell."""
    command = 'python create_plays_by_user_filtered.py'
    os.system(command)


def delete_map_reduce_contetn():
    """Remove generated map-reduce output from ``./map_reduce/``.

    Deletes every sub-directory of the folder, then the
    ``created_collections.txt`` file if it is present. Other plain files in
    the folder are left alone. A failure to delete one directory is printed
    and does not stop the remaining entries from being processed.

    Raises:
        OSError: if ``./map_reduce/`` itself cannot be listed.
    """
    folder = './map_reduce/'
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        if not os.path.isdir(entry_path):
            continue
        try:
            shutil.rmtree(entry_path)
        except Exception as e:
            # Best-effort cleanup: report the problem and keep going.
            # The `as` form and print(...) are valid on Python 2.6+ and 3.
            print(e)
    try:
        # Derived from `folder` so the path is defined in one place only.
        os.remove(os.path.join(folder, 'created_collections.txt'))
    except OSError:
        # Nothing to remove: the file was never created.
        pass

# Script section: run every database initialisation step, in order.
print('INITIALIZING DATABASE')
invoke_measurable_task(lambda: import_triplets_to_mongo_db(),
                       'Import triplets to mongo db')
invoke_measurable_task(lambda: create_map_reduce_collections(),
                       'Create Map Reduce collections')
invoke_measurable_task(lambda: plays_by_user_simple(),
                       'Create collection plays_by_user_simple')
invoke_measurable_task(lambda: plays_by_user_binary(),
                       'Create collection plays_by_user_binary')
invoke_measurable_task(lambda: plays_by_user_filtered(),
                       'Create collection plays_by_user_filtered')
        if len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    if len(plays_batch) > 0:
        insert_batch(plays_batch, target_collection)


# Script section: load the most popular songs, then build the binary
# per-user play collections for the train (_t) and validation (_v) sets.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element list used as a mutable cell: the loader writes its result
    # into slot 0 so it can be read back once the task has run.
    most_popular_songs = [1]

    def load_most_popular_songs():
        """Store the top songs, sorted by descending 'value', in slot 0."""
        ranked = db.play_count_by_song_t.find().sort('value', DESCENDING)
        top = ranked.limit(NUMBER_OF_MOST_POPULAR_SONGS)
        most_popular_songs[0] = list(top)

    invoke_measurable_task(
        load_most_popular_songs,
        'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS)
    # Unwrap the cell: from here on the name refers to the loaded list itself.
    most_popular_songs = most_popular_songs[0]

    invoke_measurable_task(
        lambda: create_plays_by_user_binary(
            db.plays_by_user_t,
            most_popular_songs,
            db.plays_by_user_binary_t),
        "Create plays_by_user_binary collection for train set")

    invoke_measurable_task(
        lambda: create_plays_by_user_binary(
            db.plays_by_user_v,
            most_popular_songs,
            db.plays_by_user_binary_v),
        "Create plays_by_user_binary collection for validation set")
Exemplo n.º 16
0
                song_number = 1
                for song_id in songs.iterkeys():
                    if song_number >= 500:
                        break

                    file.write(str(' ' + str(song_id)))
                    song_number += 1

                file.write('\n')


# Script section: load the song buckets and the validation plays, then run
# the LSH-optimised knn search.
# One-element lists act as mutable cells the loaders below can write into.
buckets = [1]
plays_for_validated_users = [1]

with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    def load_buckets():
        """Store a mapping of bucket '_id' to bucket 'value' in slot 0."""
        bucket_docs = list(db.plays_by_most_often_played_song_t.find())
        buckets[0] = {doc['_id']: doc['value'] for doc in bucket_docs}

    invoke_measurable_task(load_buckets, 'Load buckets')

    def load_plays_for_all_users():
        """Store up to NUMBER_OF_USERS validation documents in slot 0."""
        validation_cursor = db.plays_by_user_simple_v.find()
        limited = validation_cursor.limit(NUMBER_OF_USERS)
        plays_for_validated_users[0] = list(limited)

    invoke_measurable_task(
        load_plays_for_all_users,
        'Load plays for validated users')


# Unwrap both cells so the names refer to the loaded data directly.
buckets = buckets[0]
plays_for_validated_users = plays_for_validated_users[0]


def find_knn_scores():
    """Build the knn model over the buckets and run it for the loaded users."""
    model = LshOptimizedJaccardBasedKnn(buckets, KNN_K, MIN_SIMILARITY)
    model.find_for_users(plays_for_validated_users)


invoke_measurable_task(
    find_knn_scores,
    'Find knn for %d users' % NUMBER_OF_USERS)
    naive_bayes(arguments)


def save_to_file(collection):
    """Write the values of every document in *collection* to WORK_FILE.

    Each document produces one line: every value of its ``'value'`` mapping,
    converted with ``str`` and followed by a single space, then a newline.
    Any previous content of WORK_FILE is replaced.

    Args:
        collection: iterable of mappings, each holding a ``'value'`` mapping.
    """
    # Mode 'w' truncates on open. The previous 'a' + truncate() + seek(0)
    # combination did not reliably clear the file, because append mode writes
    # at the end of the file regardless of the seek position. The `with`
    # block also closes the file if a write fails.
    with open(WORK_FILE, 'w') as f:
        for song in collection:
            values = song[u'value']
            f.write(''.join(str(values[song_id]) + ' ' for song_id in values))
            f.write('\n')


# Script section: load a sample of binary plays, dump it to WORK_FILE and
# run the Naive Bayes learning step on that file.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element list used as a mutable cell for the loader's result.
    collection = [1]

    def load_collection():
        """Store up to COUNT documents of plays_by_user_binary_t in slot 0."""
        limited = db.plays_by_user_binary_t.find().limit(COUNT)
        collection[0] = list(limited)

    invoke_measurable_task(
        lambda: load_collection(),
        'Get collection')

    invoke_measurable_task(
        lambda: save_to_file(collection[0]),
        'Save collection to file')

    invoke_measurable_task(
        lambda: learn_bayes(WORK_FILE),
        'Teach Naive Bayes')
    f.truncate()
    for song in collection:
        for song_id in song[u'value']:
            val = (song[u'value'][song_id])
            f.seek(0)
            f.write(str(val))
            f.write(' ')
        f.write('\n')

    f.close()


# Script section: load a sample of binary plays, dump it to WORK_FILE and
# run the Naive Bayes learning step on that file.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    # One-element list used as a mutable cell for the loader's result.
    collection = [1]

    def load_collection():
        """Store up to COUNT documents of plays_by_user_binary_t in slot 0."""
        limited = db.plays_by_user_binary_t.find().limit(COUNT)
        collection[0] = list(limited)

    invoke_measurable_task(lambda: load_collection(), 'Get collection')

    invoke_measurable_task(lambda: save_to_file(collection[0]),
                           'Save collection to file')

    invoke_measurable_task(lambda: learn_bayes(WORK_FILE), 'Teach Naive Bayes')
        for line in knn_results:
            print i
            knn_map.append(get_data_from_fileline(line))
            i += 1


with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local
    most_popular_songs = [1]

    def load_most_popular_songs():
        most_popular_songs[0] = list(db.play_count_by_song_t.find().sort(
            'value', DESCENDING).limit(NUMBER_OF_MOST_POPULAR_SONGS))

    invoke_measurable_task(
        load_most_popular_songs,
        'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS)
    most_popular_songs = most_popular_songs[0]

    load_knn_results()

    with open(KAGGLE_USERS_MAPPING_FILE_PATH) as kaggle_users:
        with open(KNN_CONVERTED_PATH, 'w') as my_file:
            my_file.write('Id,Expected\n')
            j = 1
            for index, user_id in enumerate(kaggle_users):
                my_file.write(user_id.strip() + ',')
                result = knn_map[index]
                index += 1

                predicted_songs = 0
            '_id': plays_by_user['_id'],
            'value': [song_id for song_id in plays_by_user['value']]
        }

        # sort by plays count
        entry['value'].sort(key=lambda x: plays_by_user['value'][x],
                            reverse=True)

        if len(entry['value']) > 1:
            plays_batch.append(entry)

        if len(plays_batch) > 0 and len(plays_batch) % BATCH_SIZE == 0:
            insert_batch(plays_batch, target_collection)
            plays_batch = []

    if len(plays_batch) > 0:
        insert_batch(plays_batch, target_collection)


# Script section: build the plays_by_user_simple collections for the train
# (_t) and validation (_v) sets.
with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local

    invoke_measurable_task(
        lambda: create_plays_by_user_filtered_simple(
            db.plays_by_user_t,
            db.plays_by_user_simple_t),
        "Create plays_by_user_simple_filtered collection for train set")

    invoke_measurable_task(
        lambda: create_plays_by_user_filtered_simple(
            db.plays_by_user_v,
            db.plays_by_user_simple_v),
        "Create plays_by_user_simple_filtered collection for validation set")