示例#1
0
def get_results(dataset_name, coldstart, cs_type='none', n_entries=0):
    """Cross-validate the hybrid model and two baselines on one dataset.

    The dataset is loaded through ``get_dataset``; ``HybridModel`` (with
    ``hybrid_config``), ``BiasEstimator`` and ``SVD`` (each with an empty
    config) are wrapped in ``EvalModel`` entries and handed to
    ``evaluate_models_xval``.

    Args:
        dataset_name: Name passed to ``get_dataset``.
        coldstart: Forwarded unchanged to ``evaluate_models_xval``.
        cs_type: Forwarded unchanged to ``evaluate_models_xval``.
        n_entries: Forwarded unchanged to ``evaluate_models_xval``.

    Returns:
        Whatever ``evaluate_models_xval`` returns for the three models.
    """
    dataset = get_dataset(dataset_name)

    # Imported lazily, inside the function, exactly like the original code.
    from hybrid_model.hybrid import HybridModel
    from hybrid_model.config import hybrid_config
    from hybrid_model.models import BiasEstimator
    from hybrid_model.models import SVD

    # (model class, config) pairs in evaluation order; every baseline gets
    # its own empty dict so no config object is shared between models.
    candidates = (
        (HybridModel, hybrid_config),  # Hybrid Model
        (BiasEstimator, {}),           # Bias Baseline
        (SVD, {}),                     # SVD
    )
    models = [
        EvalModel(model_cls.__name__, model_cls, model_cfg)
        for model_cls, model_cfg in candidates
    ]

    return evaluate_models_xval(
        dataset,
        models,
        coldstart=coldstart,
        cs_type=cs_type,
        n_entries=n_entries,
    )
示例#2
0
def analyze(ds_name):
    """Print descriptive statistics for the dataset named ``ds_name``.

    The dataset and its feature descriptions are fetched through
    ``dataset.get_dataset`` and ``dataset.get_dataset_desc``. The function
    then prints, in order:

    * user/item counts, number of non-zero entries and the filled fraction
      of the dense user-item matrix (labelled "Sparsity" in the output),
    * average interactions per user / per item and the number of users and
      items without any non-zero entry,
    * the global mean of ``y``,
    * per-feature tables for users and items (column sums, interaction
      counts, and the mean of ``y - mean(y)`` over matching interactions),
    * feature co-occurrence tables, and a table of the mean shifted rating
      for every pair of concatenated user/item features.

    Args:
        ds_name: Dataset name understood by the ``dataset`` module.

    Returns:
        None. All results are written to stdout.

    Raises:
        AssertionError: If the feature matrices' shapes do not match
            ``(n_users, n_user_features)`` / ``(n_items, n_item_features)``.
    """
    ds = dataset.get_dataset(ds_name)
    (inds_u, inds_i, y, users_features, items_features) = ds.data
    # Only the feature descriptions are used below; the first two entries
    # of the description tuple are intentionally discarded.
    (_, _, users_features_desc,
     items_features_desc) = dataset.get_dataset_desc(ds_name)

    n_users = ds.n_users
    n_items = ds.n_items

    n_users_features = len(users_features_desc)
    n_items_features = len(items_features_desc)

    # Sanity checks
    assert (n_users, n_users_features) == users_features.shape
    assert (n_items, n_items_features) == items_features.shape

    # Dense user x item matrix; the explicit loop keeps "last write wins"
    # semantics should the same (user, item) pair occur more than once.
    matrix = np.zeros((n_users, n_items))
    for u, i, r in zip(inds_u, inds_i, y):
        matrix[u, i] = r

    # Boolean mask of filled cells, computed once. Summing a boolean array
    # already yields integer counts, so no cast through the removed
    # ``np.int`` alias is needed (it raises AttributeError on NumPy >= 1.24).
    # NOTE: a stored value of exactly 0 is indistinguishable from "no entry".
    has_entry = matrix != 0
    entries_per_user = has_entry.sum(axis=1)
    entries_per_item = has_entry.sum(axis=0)

    entries = int(has_entry.sum())
    # Fraction of filled cells; the "Sparsity" label is kept so the printed
    # output stays unchanged.
    sparsity = float(entries)
    sparsity /= (matrix.shape[0] * matrix.shape[1])
    print('Number of users {}'.format(n_users))
    print('Number of Items {}'.format(n_items))
    print('Total valid entries {}'.format(entries))
    print('Sparsity {:4.4f}%'.format(sparsity * 100))
    items_per_user_avg = np.mean(entries_per_user)
    users_per_item_avg = np.mean(entries_per_item)
    print('Average number of items per user {}'.format(items_per_user_avg))
    print('Average number of users per item {}'.format(users_per_item_avg))

    users_without_items = np.sum(entries_per_user == 0)
    print('Users without any items {}'.format(users_without_items))

    items_without_users = np.sum(entries_per_item == 0)
    print('Items without any users {}'.format(items_without_users))

    # Global average of the targets; the per-feature "avg rating" columns
    # below are means of the shifted values, i.e. deviations from it.
    ga = np.mean(y)
    y_shift = y - ga
    print('Global Average: {:4.4f}'.format(ga))

    # Feature rows of the user / item behind every interaction.
    user_feats_per_interaction = users_features[inds_u, :]
    item_feats_per_interaction = items_features[inds_i, :]

    # NOTE(review): the column labels assume the feature matrices hold
    # 0/1 indicators - confirm. A feature with no interactions gives a
    # zero denominator and therefore NaN/inf in 'avg rating'.
    user_stats = pd.DataFrame([], index=users_features_desc)
    user_stats['# users'] = np.sum(users_features, 0)
    user_stats['# interactions'] = np.sum(user_feats_per_interaction, 0)
    user_stats['avg rating'] = (
        user_feats_per_interaction.T @ y_shift
        / np.sum(user_feats_per_interaction, 0))
    print(user_stats)

    item_stats = pd.DataFrame([], index=items_features_desc)
    item_stats['# items'] = np.sum(items_features, 0)
    item_stats['# interactions'] = np.sum(item_feats_per_interaction, 0)
    item_stats['avg rating'] = (
        item_feats_per_interaction.T @ y_shift
        / np.sum(item_feats_per_interaction, 0))
    print(item_stats)

    # NOTE(review): the hard-coded 7 / 9 split presumably matches the
    # feature layout of one specific dataset - confirm before using this
    # function with other datasets.
    user_feature_stats = pd.DataFrame(
        (users_features.T @ users_features)[7:, :9],
        index=users_features_desc[7:],
        columns=users_features_desc[:9])
    print(user_feature_stats)

    item_feature_stats = pd.DataFrame((items_features.T @ items_features),
                                      index=items_features_desc,
                                      columns=items_features_desc)
    print(item_feature_stats)

    # Per-interaction concatenation of user and item features, used for the
    # pairwise co-occurrence counts across both feature sets.
    concat_features_desc = users_features_desc + items_features_desc
    concat_features_interactions = np.concatenate(
        (user_feats_per_interaction, item_feats_per_interaction), 1)
    feature_interactions = pd.DataFrame(
        concat_features_interactions.T @ concat_features_interactions,
        index=concat_features_desc,
        columns=concat_features_desc)
    print(feature_interactions)

    # Sum of shifted ratings per feature pair divided by the pair's
    # co-occurrence count; pairs that never co-occur divide by zero.
    feature_corr = pd.DataFrame(
        (concat_features_interactions.T * y_shift)
        @ concat_features_interactions / feature_interactions.values,
        index=concat_features_desc,
        columns=concat_features_desc)
    print(feature_corr)
示例#3
0
import script_chdir
import numpy as np
import results.plots as lplot
import matplotlib.pyplot as plt

from hybrid_model.dataset import get_dataset
from hybrid_model.index_sampler import IndexSamplerUserItembased as IndexSampler

# Load the 'ml100k' dataset and unpack its interaction arrays and features.
dataset = get_dataset('ml100k')
inds_u, inds_i, y, users_features, items_features = dataset.data

# The per-user / per-item counts come straight from the index arrays via
# bincount, so no dense user x item matrix has to be materialised.

# User side: occurrences of every user index, the user ids ordered by
# descending count, and the counts rearranged into that order.
user_dist = np.bincount(inds_u, minlength=dataset.n_users)
order_users = np.argsort(-user_dist)
dist_users = user_dist[order_users]

# Item side: same three steps for the item indices.
item_dist = np.bincount(inds_i, minlength=dataset.n_items)
order_items = np.argsort(-item_dist)
dist_items = item_dist[order_items]

# argsort of a permutation is its inverse, i.e. it maps an original id to
# its position in the sorted order. Relabel every interaction with that
# position so that index 0 is the most frequent user / item.
inds_u = np.argsort(order_users)[inds_u]
inds_i = np.argsort(order_items)[inds_i]