Example #1

import sqlite3

import pandas as pd

from utils.utils import extract_features
from utils.opening_feat import save_obj  # save_obj's home module, per Example #9

# _movies_bof = load_features('content/bof_128.bin')
# print(_movies_bof[9089])
_movies_bof_normalized = extract_features('content/bof_128.bin')
# print(_movies_bof_normalized[9089])

conn = sqlite3.connect('content/database.db')

_user_ratings = pd.read_sql("SELECT r.userid AS userID, t.id "
                            "FROM movielens_rating r "
                            "JOIN movielens_movie m ON m.movielensid = r.movielensid "
                            "JOIN trailers t ON t.imdbid = m.imdbidtt "
                            "AND t.best_file = 1 "
                            "WHERE r.rating > 4 "
                            "AND r.userid < 5000 "
                            "ORDER BY r.userid", conn)
users_bof = {}
_users = _user_ratings['userID'].unique()

for user in _users:
    users_bof[user] = []

    _current_user_ratings = _user_ratings[_user_ratings['userID'] == user]

    for key, item in _current_user_ratings.iterrows():
        item_bof = _movies_bof_normalized[item['id']]
        users_bof[user].append(item_bof)

save_obj(users_bof, '3112_users_bof')
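
The helpers save_obj and load_features are not defined on this page. A minimal sketch of what they could look like, assuming they are thin pickle wrappers (the .pkl path in Example #8 suggests this); the obj/ directory and naming scheme are guesses:

import pickle


def save_obj(obj, name):
    # Hypothetical sketch: pickle an object under obj/<name>.pkl.
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_features(path):
    # Hypothetical sketch: load a previously pickled feature dict.
    with open(path, 'rb') as f:
        return pickle.load(f)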
Example #2

from sklearn.metrics.pairwise import cosine_similarity

from utils.opening_feat import save_obj

# tfidf_array and _all_ratings are assumed to be loaded earlier in the script
# (e.g., the TF-IDF array built in Example #7 or #9).
count = 0
_safe_exit = 2

trailer_tfidf_similarities = dict()

for i in range(0, len(tfidf_array)):
    # print(sum(tfidf_array[i]))
    trailer_id = _all_ratings.iloc[i]
    print(trailer_id)
    trailer_tfidf_similarities[trailer_id[0]] = {}
    # trailer_tfidf_similarities[trailer_id[0]] = []

    for j in range(0, len(tfidf_array)):

        # if i == j:  # avoid self-comparison
        #    continue

        sim = cosine_similarity([tfidf_array[i]], [tfidf_array[j]])
        # trailer_tfidf_similarities[trailer_id[0]].append((_all_ratings.iloc[j][0], sim[0][0]))
        trailer_tfidf_similarities[trailer_id[0]][_all_ratings.iloc[j][0]] = sim[0][0]

    # trailer_tfidf_similarities[trailer_id[0]] = sort_desc(trailer_tfidf_similarities[trailer_id[0]])

    # count += 1
    # if count == _safe_exit:
    #     break

# print(trailer_tfidf_similarities)
save_obj(trailer_tfidf_similarities, 'trailer_tfidf_synopsis_similarities')
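
The double loop above calls cosine_similarity once per pair, which is O(n²) calls. Passing the whole matrix in a single call should produce the same numbers much faster; a sketch using the same assumed tfidf_array:

# sim_matrix[i][j] is the cosine similarity between rows i and j of tfidf_array
sim_matrix = cosine_similarity(tfidf_array)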
Example #3

# This excerpt starts inside a loop over each movie's candidate neighbors;
# the surrounding loop headers are not part of the snippet.
            continue

        intersect = pd.merge(_all_ratings[_all_ratings['id'] == movie],
                             _all_ratings[_all_ratings['id'] == neighbor],
                             on='userID')
        # print(intersect)
        # exit()

        # if len(intersect) > 4:
        if not intersect.empty:

            try:
                # sim = cosine_similarity(intersect['rating_x'].reshape(1, -1), intersect['rating_y'].reshape(1, -1))
                # sim = cosine_similarity([intersect['rating_x']], [intersect['rating_y']])

                sim = adjusted_cosine(intersect, user_profiles)
                movie_similarity[movie].append((neighbor, sim))

                # sim = cosine_similarity(intersect['rating_x'].reshape(1, -1), intersect['rating_y'].reshape(1, -1))
                # movie_similarity[movie].append((neighbor, sim[0][0]))
            except ValueError:
                continue
        else:
            movie_similarity[movie].append((neighbor, 0))

        movie_similarity[movie] = sort_desc(movie_similarity[movie])
    # print(movie_similarity[movie])
    # break

save_obj(movie_similarity, 'item_item_collaborative_similarities')
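
Neither adjusted_cosine nor sort_desc is defined in the snippet. A sketch under stated assumptions: adjusted cosine is taken in its usual item-item form, offsetting each rating by that user's mean, which user_profiles is assumed to expose as an 'avg' column (as it does in Example #11); sort_desc simply orders (key, score) pairs by score:

import math


def adjusted_cosine(intersect, user_profiles):
    # Hypothetical sketch: adjusted cosine over the co-rated DataFrame,
    # which has userID, rating_x and rating_y columns after pd.merge.
    num = den_x = den_y = 0.0
    for _, row in intersect.iterrows():
        avg = user_profiles.loc[row['userID']]['avg']  # assumed 'avg' column
        dx, dy = row['rating_x'] - avg, row['rating_y'] - avg
        num += dx * dy
        den_x += dx ** 2
        den_y += dy ** 2
    den = math.sqrt(den_x) * math.sqrt(den_y)
    return num / den if den else 0


def sort_desc(pairs, desc=True):
    # Sort (key, score) tuples by score; desc=False is used in Example #8,
    # where scores are distances and smaller means more similar.
    return sorted(pairs, key=lambda pair: pair[1], reverse=desc)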
Example #4

# This excerpt starts inside the dict of metrics that experiment(), called
# below, presumably returns; the hduc* variables are computed earlier in it.
            'precision': hducp,
            'recall': hducr,
            'diversity': hducd,
            'mae': hducm,
            'rankscore': hducrs,
            'f1': hducf1
        },
        # 'switching-hybrid': {'precision': swp, 'recall': swr, 'diversity': swd, 'mae': swm, 'rankscore': swrs, 'f1': swf1},

        # 'linear-regression': {'precision': lrp, 'recall': lrr, 'diversity': lrd, 'mae': lrm}
        # 'weighted-hybrid-content-item': {'precision': h2p, 'recall': h2r, 'diversity': h2d, 'mae': h2m, 'rankscore': h2rs, 'f1': h2f1},
        # 'weighted-hybrid-collaborative': {'precision': h3p, 'recall': h3r, 'diversity': h3d, 'mae': h3m, 'rankscore': h3rs, 'f1': h3f1},
    }


results = {}

for index in range(2, 16):
    results[index] = experiment(index, new_user_profiles, convnet_sim_matrix,
                                low_level_sim_matrix,
                                _trailers_tfidf_sims_matrix,
                                _trailers_tfidf_synopsis_sims_matrix)

# print(results)
print(results[15])

save_obj(new_user_profiles, 'profiles_with_predictions')
save_obj(results, 'results_50_users')
end = time.time()
print "Execution time", (end - start), "seconds."
Example #5

# _all_movies and _all_ratings are assumed to be loaded from the database
# earlier in the script; count and start feed the progress reporting below.
count = 0
start = time.time()

item_similarities = dict()

for key, target_movie_movielens_id in _all_movies.iterrows():

    other_items = _all_movies.iloc[key + 1:]
    similarities = []

    for sub_key, neighbour_movie_id in other_items.iterrows():

        join_ratings = pd.merge(_all_ratings[_all_ratings['movielensID'] == neighbour_movie_id.iloc[0]],
                                _all_ratings[_all_ratings['movielensID'] == target_movie_movielens_id.iloc[0]], on='userID')
        sim = 0

        if len(join_ratings) > 0:
            ratings_x = np.array(join_ratings['rating_x'])
            ratings_y = np.array(join_ratings['rating_y'])
            sim = cosine_similarity(ratings_x.reshape(1, -1), ratings_y.reshape(1, -1))[0][0]
        similarities.append((neighbour_movie_id.iloc[0], sim))  # keep the id, not the whole row

    ordered = sort_desc(similarities)[:30]
    item_similarities[target_movie_movielens_id.iloc[0]] = ordered

    count += 1
    if count % 100 == 0:
        print(count, "movies read")
    # break  # debug leftover: uncomment to stop after the first movie

# print(item_similarities)
print("finished in", time.time() - start, "seconds")
save_obj(item_similarities, 'item_collaborative_similarity')
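
Assuming save_obj writes pickles as sketched under Example #1, the saved similarities can later be read back with load_features; the exact path depends on save_obj's naming scheme:

_item_sims = load_features('content/item_collaborative_similarity.pkl')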
Example #6

# This excerpt starts inside a loop over users: theta_vectors, user_movies,
# _alpha, _lambda and the 128-d _deep_features_bof (129-d once the bias term
# is prepended below) come from the surrounding script.
        for movie in user_movies:

            try:
                new_movie_vector = np.insert(_deep_features_bof[movie], 0, 1)
            except KeyError:
                continue

            rating = _all_ratings[(_all_ratings['userID'] == user) & (
                _all_ratings['id'] == movie)]['rating'].iloc[0]

            # bias update (index 0): no regularization term
            error = theta_vectors[user].dot(new_movie_vector) - rating
            theta_vectors[user][0] -= _alpha * error * new_movie_vector[0]

            # for every theta (weight) value
            for index in range(1, len(theta_vectors[user])):

                # the error is recomputed with the partially updated theta,
                # so later coordinates see the earlier updates
                part1 = theta_vectors[user].dot(new_movie_vector) - rating
                theta_vectors[user][index] -= _alpha * (
                    part1 * new_movie_vector[index] +
                    _lambda * theta_vectors[user][index])

    print "user", user
    # break

# print "modified", theta_vectors[user]

save_obj(theta_vectors, 'users_theta_vectors')
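
For reference, the per-coordinate update above implements regularized linear regression with the bias excluded from the decay term. With NumPy it can be written as one vectorized step per rating; note this is a simultaneous update, a slight variation on the coordinate-by-coordinate one above:

error = theta_vectors[user].dot(new_movie_vector) - rating
gradient = error * new_movie_vector
gradient[1:] += _lambda * theta_vectors[user][1:]  # no decay on the bias term
theta_vectors[user] -= _alpha * gradient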
Example #7

movies_tag_vectors = []

# conn, _all_ratings, _all_tags and the sql_count_tags query are assumed to
# be set up earlier in the script.
for key, movie in _all_ratings.iterrows():

    movie_tag_vector = []
    print(movie[0])

    for subkey, tag in _all_tags.iterrows():

        c = conn.cursor()
        count_tag = c.execute(sql_count_tags, (
            movie[0],
            tag[0],
        ))
        movie_count_tags = count_tag.fetchall()
        movie_tag_vector.append(movie_count_tags[0][0])

    movies_tag_vectors.append(movie_tag_vector)

    # print movie_tags

    # count += 1
    # if count == _safe_exit:
    #     break

# for movie_counts in movies_tag_vectors:
#     print(sum(movie_counts))
tfidf = transformer.fit_transform(movies_tag_vectors)
# print(tfidf.toarray())

save_obj(tfidf.toarray(), 'movies_tfidf_array')
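
transformer is not defined in the snippet; presumably it is scikit-learn's TfidfTransformer, which converts the raw tag-count vectors into TF-IDF weights. The assumed setup:

from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()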
Example #8

import numpy as np

from hausdorff import hausdorff
from utils.utils import sort_desc
# load_features and save_obj are assumed to live in utils.opening_feat,
# the module the other examples import save_obj from
from utils.opening_feat import load_features, save_obj

_users_bof = load_features('content/3112_users_bof.pkl')

# test_user_1 = np.array(_users_bof[1])
# test_user_3 = np.array(_users_bof[7])
# print(test_user_1)
# print(hausdorff(test_user_1, test_user_3))

users_bof_similarities = {}

for key, user_bof in _users_bof.items():
    users_bof_similarities[key] = []
    print("current user", key)

    for neighbor, neighbor_bof in _users_bof.items():
        if neighbor == key:
            continue

        sim = hausdorff(np.array(user_bof), np.array(neighbor_bof))
        users_bof_similarities[key].append((neighbor, sim))

    # Hausdorff is a distance, so sort ascending: smaller means more similar
    users_bof_similarities[key] = sort_desc(users_bof_similarities[key],
                                            desc=False)
    # print(users_bof_similarities[key])
    # break

save_obj(users_bof_similarities, '3112_user_user_bof_similarities')
Example #9

import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.opening_feat import save_obj


v = TfidfVectorizer()

conn = sqlite3.connect('/home/ralph/Dev/content-based-recsys/content/database.db')

_all_movies = pd.read_sql('select distinct t.id, ms.Plot '
                          'from movielens_rating r '
                          'join movielens_movie m on m.movielensid = r.movielensid '
                          'join trailers t on t.imdbid = m.imdbidtt '
                          'join movies ms on ms.imdbID = t.imdbid '
                          'where t.best_file = 1 '
                          # 'and userid < 5000 '
                          'order by t.id ', conn)

plots = []

for key, movie in _all_movies.iterrows():
    # print(movie['Plot'])
    print(key)
    plots.append(movie['Plot'])

x = v.fit_transform(plots)

save_obj(x.toarray(), 'movies_tfidf_synopsis_array')
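
A quick way to sanity-check the vectorizer output; get_feature_names_out is the current scikit-learn accessor for the learned vocabulary (older versions call it get_feature_names):

print(x.shape)  # (number of movies, vocabulary size)
print(v.get_feature_names_out()[:10])  # a few of the learned terms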
Example #10

# c (a database cursor), _movies_sql, _3112_user_ratings, get_user_rating,
# _ratings_matrix and row are assumed to be set up earlier in the script.
for user in _3112_user_ratings['userID'].unique():
    _movies = c.execute(_movies_sql)
    column = 0
    for movie in _movies.fetchall():
        _user_rating = get_user_rating(_3112_user_ratings, user, movie[0])
        if not _user_rating.empty:
            _ratings_matrix[row][column] = _user_rating['rating'].iloc[0]
        column += 1
    row += 1

# print(_ratings_matrix[0])
df = pd.DataFrame(_ratings_matrix)
# print(df)
df.fillna(df.mean(), inplace=True)  # fill gaps with each item's mean rating
# print(df)
# print(_ratings_matrix)
# exit()

# Save the full matrix, with each item's mean in the empty cells
save_obj(df, 'full_matrix_for_svd')

# print(full_ratings)
# exit()
# scaled = preprocessing.scale(np.matrix(full_ratings))
# full_matrix = np.nan_to_num(np.array(full_ratings))
# normalized = preprocessing.normalize(full_matrix, norm='l2')

# print(np.matrix(full_ratings))
# np.savetxt('full_matrix_for_svd_item_mean_imputation', np.matrix(full_ratings))
# np.savetxt('full_matrix_for_svd_normalized', normalized)
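
get_user_rating is not shown; a plausible sketch, assuming it filters the ratings DataFrame down to one user/movie pair (the movielensID column name is taken from Example #5):

def get_user_rating(ratings, user, movielens_id):
    # Hypothetical helper: return the (possibly empty) matching rating rows.
    return ratings[(ratings['userID'] == user) &
                   (ratings['movielensID'] == movielens_id)]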
Example #11

# This excerpt starts inside a loop over each user's candidate neighbors;
# user_profiles, _all_ratings, target_user_ratings and scipy.stats.pearsonr
# come from the surrounding script. The enclosing try: is restored here.
        try:
            neighbor_average = user_profiles.loc[neighbor]['avg']
        except IndexError as e:
            print e, "neighbor", neighbor, "failed"

        try:
            intersect = pd.merge(
                _all_ratings[_all_ratings['userID'] == neighbor],
                target_user_ratings,
                on='id')

            if len(intersect) < 5:
                sim = 0
            else:
                sim = pearsonr(intersect['rating_x'], intersect['rating_y'])[0]
                # ssim = sum([(item['rating_x'] - neighbor_average) * (item['rating_y'] - target_user_average)
                #             for k, item in intersect.iterrows()]) / (
                #     math.sqrt(sum([(item['rating_x'] - neighbor_average) ** 2 for k, item in intersect.iterrows()])) *
                #     math.sqrt(sum([(item['rating_y'] - target_user_average) ** 2 for k, item in intersect.iterrows()])))
        except ValueError:
            sim = 0

        # pearsonr returns NaN for constant inputs; NaN fails both comparisons,
        # so this resets it to 0
        if not (sim > 0 or sim < 0):
            sim = 0

        user_user_similarities[user].append((neighbor, sim))

    user_user_similarities[user] = sort_desc(user_user_similarities[user])
    # break

save_obj(user_user_similarities, 'user_user_collaborative_similarities')