예제 #1
0
def get_group_ratings():
    """Return, for every group and item, how often each rating value occurs.

    The result is a float32 array indexed as [group, item, rating index].
    Entry [g, i, k] is the number of entries equal to ``k + 1`` found in
    column ``i`` of the rows of the ratings matrix selected by group ``g``.

    The array is loaded from ``group_ratings_cache_file`` when that file
    exists; otherwise it is computed, written to that file and returned.
    """
    cache_path = group_ratings_cache_file

    # Cache hit: nothing to compute.
    if os.path.isfile(cache_path):
        with msg(f'Reading group ratings from "{cache_path}"'):
            return np.load(cache_path)

    with msg('Getting group ratings'):
        R = get_R()
        groups = get_groups()
        n_items = R.shape[1]
        n_values = rating_value_count

        shape = (number_of_groups(), n_items, n_values)
        counts = np.zeros(shape, dtype=np.float32)
        with msg('Calculating rating counts'):
            for g, members in enumerate(groups):
                for rating in range(1, n_values + 1):
                    # Column-wise tally of this group's entries equal to
                    # `rating`; rating values start at 1, indices at 0.
                    tally = (R[members] == rating).sum(axis=0)
                    counts[g, :, rating - 1] = tally

        with msg(f'Saving group ratings to "{cache_path}"'):
            np.save(cache_path, counts)

        return counts
def total_rmse():
    """Print a running mean of ``get_rmse`` over the items with the largest lam.

    Items are ranked by ``lam`` summed over axis 0 and the 500 largest are
    visited. For every (item, group) pair, ``get_rmse`` is called with the
    group's stored ratings for that item, ``Rtilde[group, item]`` as the mean
    and ``Rvar[group, item]`` as the variance. After every 10th item the mean
    of all values accumulated so far is printed.

    Reads the module-level ``R``, ``P``, ``lam``, ``Rtilde`` and ``Rvar``.
    Returns None; the results are only printed.
    """
    group_count = DataReader.nym_count()
    # Named differently from the function so the function is not shadowed.
    rmse_sum = 0

    item_lam = lam.sum(axis=0)
    highest_n = 500
    # argpartition places the `highest_n` largest entries (unordered) last.
    large_items = np.argpartition(item_lam, -highest_n)[-highest_n:]

    with msg('Splitting group ratings'):
        group_ratings = [R[P[group]] for group in range(group_count)]

    with msg('Getting rmse(s)'):
        count = 0
        for nth_item, item in enumerate(large_items):
            for group in range(group_count):
                mean = Rtilde[group, item]
                count += 1
                data = group_ratings[group][:, item].data
                var = Rvar[group, item]
                if var == 0:
                    # Replace a zero variance with a small positive value
                    # (presumably to keep the model non-degenerate).
                    var = 0.01
                rmse_sum += get_rmse(data, mean, var)

            # `count` stays 0 when there are no groups; skip the report
            # instead of dividing by zero.
            if nth_item % 10 == 0 and count > 0:
                mean_rmse = rmse_sum / count
                print(f'[{nth_item}, {count}] Mean RMSE: {mean_rmse}')
예제 #3
0
    def get_group_rating_distributions():
        """Return the distribution of rating values per group and item.

        The result is a float32 array indexed as [group, item, rating index],
        where each [group, item] row sums to 1. Stored ratings are mapped to
        an index with ``int(rating - 1)``. A (group, item) pair with no
        stored ratings gets the uniform distribution ``1 / rating_count``.

        The array is loaded from ``DataReader.group_rating_dists_cache_file``
        when that file exists; otherwise it is computed, saved there and
        returned.
        """
        cachefile = DataReader.group_rating_dists_cache_file
        if os.path.isfile(cachefile):
            with msg(
                    f'Reading distribution of ratings for each item per group from "{cachefile}"'
            ):
                return np.load(cachefile)

        with msg('Getting distribution of ratings for each item and group'):
            R = DataReader.get_ratings()
            P = DataReader.get_nyms()
            group_count = DataReader.nym_count()
            item_count = R.shape[1]
            rating_count = DataReader.rating_value_count

            dists = np.zeros((group_count, item_count, rating_count),
                             dtype=np.float32)
            for group_n, group in enumerate(P):
                with msg(f'Calculating group {group_n} distributions'):
                    R_g = R[group].tocoo()
                    # Truncating cast, same as int(rating - 1) per entry.
                    rating_index = (R_g.data - 1).astype(int)
                    # Unbuffered add: repeated (item, rating) pairs each
                    # contribute 1, exactly like the per-entry `+= 1` loop.
                    np.add.at(dists[group_n], (R_g.col, rating_index), 1)

            with msg('Normalising distributions'):
                totals = dists.sum(axis=2, keepdims=True)
                # 0/0 is expected for unrated (group, item) pairs and is
                # patched just below, so silence the floating-point warning.
                with np.errstate(divide='ignore', invalid='ignore'):
                    dists /= totals
                # nan's imply 0 ratings by group on item, so give equal distribution
                dists[np.isnan(dists)] = 1.0 / rating_count

            with msg(f'Saving distribution of ratings to "{cachefile}"'):
                np.save(cachefile, dists)

            return dists
예제 #4
0
def barplot_rating_dist(item, single=False, group=None, savefig=None):
    """Plot a histogram of the stored ratings of one item.

    Args:
        item: column index into the matrix returned by ``Data.get_ratings()``.
        single: when true (and ``group`` is None), plot one histogram of all
            stored ratings for the item.
        group: when not None, plot only the ratings of ``nyms[group]``;
            takes precedence over ``single``.
        savefig: when None the figure is shown; otherwise it is saved to this
            path at 150 dpi and the current figure is cleared.

    With neither ``group`` nor ``single``, one outlined histogram per nym is
    drawn together with a legend.
    """
    with msg("plotting rating distribution"):
        item_ratings = Data.get_ratings()[:, item]
        nyms = Data.get_nyms()

        plt.xlabel('rating')
        plt.ylabel('no. ratings')

        bar_step = 1
        bin_edges = np.arange(bar_step / 2, 5 + 1.5 * bar_step, bar_step)

        def draw_hist(values, **kwargs):
            # Shared binning and bar width for every histogram in this plot.
            return plt.hist(values, bins=bin_edges, rwidth=bar_step * 0.75,
                            **kwargs)

        if group is not None:
            plt.title(f'Item {item}, group {group} rating distribution')
            draw_hist(item_ratings[nyms[group]].data)
        elif single:
            plt.title(f'Item {item} rating distribution')
            draw_hist(item_ratings.data)
        else:
            plt.title(f'Item {item}, all groups rating distributions')
            for nym_n, nym in enumerate(nyms):
                draw_hist(item_ratings[nym].data,
                          histtype='step',
                          linewidth=2,
                          label=f'group {nym_n}')
            plt.legend()

        if savefig is not None:
            with msg(f'Saving figure to "{savefig}"'):
                plt.savefig(savefig, dpi=150)
            plt.clf()
        else:
            plt.show()
예제 #5
0
def save_data(ratings, user_ids, movie_ids):
    """Write the ratings matrix and the original user/movie ids to disk.

    ``ratings`` is written with ``sp.save_npz`` to ``ratings_file``;
    ``user_ids`` and ``movie_ids`` are written with ``np.save`` to
    ``users_file`` and ``movies_file``. Each write is wrapped in ``msg``.
    """
    jobs = (
        (f'Saving ratings to "{ratings_file}"',
         sp.save_npz, ratings_file, ratings),
        (f'Saving original user ids to "{users_file}"',
         np.save, users_file, user_ids),
        (f'Saving original movie ids to "{movies_file}"',
         np.save, movies_file, movie_ids),
    )
    for description, writer, path, payload in jobs:
        with msg(description):
            writer(path, payload)
예제 #6
0
    def get_nym_stats():
        """Return per-nym, per-item rating statistics.

        The result is a float32 array indexed as [nym number, item number,
        stat], where the stat index means:
          0 : item index
          1 : mean of the stored ratings (0 when there are none)
          2 : variance of the stored ratings (0 when there are none)
          3 : number of stored ratings

        The array is loaded from ``DataReader.nym_stats_cache_file`` when that
        file exists; otherwise it is computed and saved there first.
        """
        cache_path = DataReader.nym_stats_cache_file

        # Cache hit: load and return straight away.
        if os.path.isfile(cache_path):
            with msg(f'Reading nym stats from "{cache_path}"'):
                return np.load(cache_path)

        ratings = DataReader.get_ratings()
        nyms = DataReader.get_nyms()
        stats = np.zeros((len(nyms), ratings.shape[1], 4), dtype=np.float32)
        for nym_n, nym in enumerate(nyms):
            with msg(f'Getting nym #{nym_n} stats'):
                # Transposing makes the iteration run over items.
                for item_n, item_column in enumerate(ratings[nym].T):
                    values = item_column.data
                    n_values = len(values)
                    if n_values > 0:
                        mean, variance = values.mean(), values.var()
                    else:
                        mean, variance = 0, 0
                    stats[nym_n, item_n] = (item_n, mean, variance, n_values)

        with msg(f'Saving nym stats to "{cache_path}"'):
            np.save(cache_path, stats)
        return stats
예제 #7
0
def get_P():
    """Load the matrix P stored under the base path ``P_file``.

    ``P_file + '.npz'`` is tried first: it is read with ``sp.load_npz`` and
    converted to a dense array. Otherwise ``P_file + '.npy'`` is read with
    ``np.load``.

    Returns:
        The loaded array.

    Raises:
        FileNotFoundError: if neither file exists.
    """
    filename = P_file
    sparse_path = filename + '.npz'
    dense_path = filename + '.npy'

    if os.path.isfile(sparse_path):
        with msg(f'Reading "{filename}.npz"'):
            return sp.load_npz(sparse_path).toarray()

    if os.path.isfile(dense_path):
        with msg(f'Reading "{filename}.npy"'):
            # .npy files are a NumPy format; scipy.sparse has no `load`.
            return np.load(dense_path)

    # Fail loudly instead of returning None, which only surfaced later as an
    # unrelated AttributeError in callers.
    raise FileNotFoundError(
        f'Neither "{sparse_path}" nor "{dense_path}" exists.')
예제 #8
0
def prepare_ratings(ratings):
    """Drop columns and rows that have no stored entries.

    Empty columns are removed first (on a csc view), then empty rows (on a
    csr view of the result).

    Returns:
        A tuple ``(ratings, row_indices, col_indices)``: the compacted csr
        matrix, the indices of the rows kept, and the indices of the columns
        kept. Row indices refer to the matrix after column removal, which has
        the same row numbering as the input.
    """
    with msg('Preparing ratings'):
        with msg('Converting to csc matrix'):
            by_column = ratings.tocsc(copy=False)
        with msg('Removing empty cols'):
            col_mask = by_column.getnnz(0) > 0
            by_column = by_column[:, col_mask]

        with msg('Converting to csr matrix'):
            by_row = by_column.tocsr(copy=False)
        with msg('Removing empty rows'):
            row_mask = by_row.getnnz(1) > 0
            compact = by_row[row_mask]

        kept_rows = np.where(row_mask)[0]
        kept_cols = np.where(col_mask)[0]
        return compact, kept_rows, kept_cols
예제 #9
0
 def read_numpy_file(filename, dtype=np.float32):
     """Load an array from ``filename + ".npy"`` or else ``filename + ".txt"``.

     Args:
         filename: base path without extension.
         dtype: dtype passed to ``np.loadtxt`` for the text fallback; it is
             not applied when the ``.npy`` file is used.

     Returns:
         The loaded array.
     """
     with msg(f'Reading "{filename}"'):
         npy_path = filename + ".npy"
         if os.path.isfile(npy_path):
             return np.load(npy_path)
         # Use a context manager so the text file is closed once parsed.
         with open(filename + ".txt", "r") as text_file:
             return np.loadtxt(text_file, dtype=dtype)
예제 #10
0
def get_groups():
    """Return the groups as a list of index arrays, one per row of P.

    Each row of the matrix from ``get_P()`` is cast to a boolean mask over
    its columns; the group is the array of column indexes where the mask is
    true.
    """
    P = get_P()
    with msg('Converting to list of user indexes by group'):
        user_indexes = np.arange(P.shape[1])
        groups = []
        for membership in P.astype(bool):
            groups.append(user_indexes[membership])
        return groups
예제 #11
0
def save_fig(f, label, tag=None):
    """Save the current matplotlib figure as a PNG and clear it.

    The file name is ``f`` followed by ``label`` with spaces replaced by
    hyphens, an optional ``-<tag>`` suffix, and ``.png``.

    Args:
        f: path prefix, concatenated directly in front of the file name.
        label: plot label used to build the file name.
        tag: optional suffix; omitted from the name when None.
    """
    suffix = '' if tag is None else f'-{tag}'
    slug = label.replace(' ', '-')
    fname = f + "{}{}.png".format(slug, suffix)
    with msg("saving", fname):
        plt.savefig(fname, bbox_inches='tight')
        plt.clf()
예제 #12
0
def plot_nym_stat(thresh=thresh_default, inv=False, savefig=False, outfile=outfile_default, begin=None, num=None, stat_option=stat_option_default):
    """Scatter-plot one per-item statistic for every nym.

    Args:
        thresh: minimum number of ratings an item needs (within a nym) to be
            plotted.
        inv: plot the reciprocal of the statistic for strictly positive
            values; other values are left unchanged.
        savefig: when true, save the figure to ``outfile`` at 150 dpi and
            clear it; otherwise show it.
        outfile: destination path used when ``savefig`` is true.
        begin: first item index to include (defaults to 0).
        num: number of items to include from ``begin`` (defaults to all).
        stat_option: 1 plots column 1 of the nym stats, 2 plots column 2, and
            3 plots the square root of column 2. It is also the key into
            ``stat_options`` for the axis/title name.

    Raises:
        ValueError: if ``stat_option`` is not 1, 2 or 3.
    """
    stat_name = stat_options[stat_option]
    # Validate up front; previously an unsupported option left `y` undefined
    # (or stale from an earlier nym) inside the plotting loop.
    if stat_option not in (1, 2, 3):
        raise ValueError(
            f'unsupported stat_option {stat_option!r}; expected 1, 2 or 3')
    if inv:
        stat_name = f'inverse {stat_name}'

    fig, ax = plt.subplots()
    ax.set(
        title=f'{stat_name} of each group by item number (thresh no. ratings >= {thresh})',
        xlabel='item number',
        ylabel=stat_name)

    nym_count = Data.nym_count()
    cm = plt.get_cmap('gist_rainbow')
    colors = [cm(1. * i / nym_count) for i in range(nym_count)]

    begin = 0 if begin is None else begin
    end = None if num is None else begin + num
    nym_stats = Data.get_nym_stats()[:, begin:end, :]

    for nym_n in range(nym_count):
        nym_n_stats = nym_stats[nym_n]
        with msg(f'plotting nym #{nym_n} {stat_name}'):

            # Column 3 holds the number of ratings behind each statistic.
            valids = (nym_n_stats[:, 3] >= thresh)
            print(f'{valids.sum()} of {len(valids)} valid (thresh = {thresh})')

            x = nym_n_stats[:, 0][valids]
            # Compare by value: `is` on ints tests identity, which is not
            # guaranteed to match equality (e.g. for NumPy integers).
            if stat_option == 1:
                y = nym_n_stats[:, 1][valids]
            elif stat_option == 2:
                y = nym_n_stats[:, 2][valids]
            else:  # stat_option == 3
                y = np.sqrt(nym_n_stats[:, 2][valids])

            if inv:
                positive = y > 0
                y[positive] = 1 / y[positive]
            # Marker area grows with the square root of the rating count.
            s = np.sqrt(nym_n_stats[:, 3][valids])

            ax.scatter(x, y, s=s, facecolors='none', edgecolors=colors[nym_n], label=f'group {nym_n}')
    ax.legend()

    if savefig:
        with msg('Saving "{}" to "{}"'.format(ax.title.get_text(), outfile)):
            ax.get_figure().savefig(outfile, dpi=150)
            plt.clf()
    else:
        plt.show()
예제 #13
0
def gen_test_data(user_groups=None, item_n=100, subdir=''):
    """Generate persistent data for testing and save it to disk.

    For every entry of ``user_groups`` (one group index per test user),
    ``item_n`` items are drawn with probabilities given by that group's row
    of ``lam`` normalised along axis 1, and one rating per drawn item is
    sampled from ``dists[group, item]``.

    Args:
        user_groups: array of group indexes, one per test user.
            NOTE(review): the default is None, but ``.shape`` is read from it
            immediately, so callers must always pass an array.
        item_n: number of items drawn per user.
        subdir: first value substituted into the three output path templates.

    Returns:
        A tuple ``(test_item_means, test_item_ratings, test_item_ids)`` with
        shapes ``(user_n, n_groups, item_n)``, ``(user_n, item_n)`` and
        ``(user_n, item_n)``. The same three arrays are saved with ``np.save``.
    """
    items = GroupRatings()
    item_count = items.item_count()

    user_n = user_groups.shape[0]
    test_item_means = np.zeros((user_n, items.n_groups, item_n))
    test_item_ratings = np.zeros((user_n, item_n))
    test_item_ids = np.zeros((user_n, item_n), dtype=int)

    # Normalise lam along axis 1 so each row can be used as sampling
    # probabilities below.
    lam_dist = items.lam() / items.lam().sum(axis=1, keepdims=True)
    dists = items.dist()
    with msg('Generating test data'):
        for n, group in enumerate(user_groups):
            # choose test items
            # (np.random.choice samples with replacement by default, so an
            # item may be drawn more than once)
            item_ids = np.random.choice(item_count,
                                        size=item_n,
                                        p=lam_dist[group])
            # Store the drawn positions mapped through items.items.
            test_item_ids[n] = items.items[item_ids]

            # get group proxy ratings
            # items.items is overwritten temporarily so that items.mean() is
            # evaluated for the drawn items, then restored with reset().
            items.items = item_ids
            test_item_means[n] = items.mean()
            items.reset()

            # get sampled ratings
            for i, item_id in enumerate(item_ids):
                rating_dist = dists[group, item_id]
                # choice() returns a 0-based index; +1 turns it into a rating.
                test_item_ratings[n, i] = np.random.choice(items.n_rating_vals,
                                                           p=rating_dist) + 1

    # Output paths are templates filled with (subdir, user_n).
    means_file = test_item_dist_means_file.format(subdir, user_n)
    sampled_file = test_sampled_ratings_file.format(subdir, user_n)
    item_ids_file = test_item_ids_file.format(subdir, user_n)

    with msg(f'Saving test item dist means to {means_file}'):
        np.save(means_file, test_item_means)
    with msg(f'Saving test item sampled ratings to {sampled_file}'):
        np.save(sampled_file, test_item_ratings)
    with msg(f'Saving test item ids to {item_ids_file}'):
        np.save(item_ids_file, test_item_ids)

    return test_item_means, test_item_ratings, test_item_ids
예제 #14
0
    def get_nyms():
        """Return the nyms as a list of integer arrays.

        ``DataReader.nyms_file`` is read as comma-separated integers. Column 1
        holds the nym number of each row and column 0 the value that belongs
        to that nym (presumably a user index - confirm against the file
        format). Element ``k`` of the returned list holds the column-0 values
        of all rows whose nym number is ``k``, for ``k`` from 0 up to the
        largest nym number present.
        """
        path = DataReader.nyms_file
        with msg(f'Reading nyms from "{path}"'), open(path, 'r') as handle:
            table = np.loadtxt(handle, delimiter=',', dtype=int)
            members, nym_of_row = table[:, 0], table[:, 1]

            nyms = []
            for nym_n in range(nym_of_row.max() + 1):
                nyms.append(members[nym_of_row == nym_n])
            return nyms
예제 #15
0
    def get_ratings():
        """Load the sparse ratings matrix from ``DataReader.ratings_file``.

        The file is read with ``sp.load_npz``; the sparse format of the
        result is whatever was stored in the file.

        Raises:
            RuntimeError: if the ratings file does not exist.
        """
        path = DataReader.ratings_file

        # Guard clause: without the file there is nothing to load.
        if not os.path.isfile(path):
            raise RuntimeError(
                f'"{path}" does not exist. Use "netflix_data.py" to generate it.'
            )

        with msg(f'Loading rating matrix from "{path}"'):
            return sp.load_npz(path)
예제 #16
0
def parse_ratings(zipfile=netflix_data):
    """Parse the ``combined_data_1..4.txt`` members of a zip archive.

    Each member consists of header lines of the form ``<item id>:`` followed
    by comma-separated lines with three fields. The first field is used as
    the row index, the second as the rating value, and the most recent header
    as the column index.

    Args:
        zipfile: path (or file object) of the archive to read.

    Returns:
        A float32 ``coo_matrix`` with one stored entry per rating line,
        indexed directly by the ids found in the files.
    """
    filecount = 4
    basefilename = 'combined_data_{}.txt'
    ratingfiles = [basefilename.format(i) for i in range(1, filecount + 1)]

    row, col, data = [], [], []
    item_id = 1
    # Report the archive actually being read, not the module default.
    with msg(f'Reading from "{zipfile}"'), ZipFile(zipfile, 'r') as myzip:
        for filename in ratingfiles:
            with msg(f'Parsing "{filename}"'), myzip.open(filename,
                                                          'r') as file:
                for line in TextIOWrapper(file):
                    stripped = line.strip()
                    if not stripped:
                        # Tolerate blank lines instead of failing on int('').
                        continue
                    tokens = stripped.split(',')
                    if len(tokens) == 3:
                        row.append(int(tokens[0]))
                        col.append(item_id)
                        data.append(int(tokens[1]))
                    else:
                        # Header line "<item id>:". Strip the colon explicitly
                        # rather than slicing off two characters, which drops
                        # a digit when the line has no trailing newline.
                        item_id = int(stripped.rstrip(':'))

    with msg('Creating sparse matrix from ratings'):
        return sp.coo_matrix((data, (row, col)), dtype=np.float32)
예제 #17
0
def heatmap_rating_dist(item):
    """Show a heatmap of how often each rating value was given per nym.

    The heatmap has 10 rows and one column per nym. A stored rating ``r`` of
    the item is counted in row ``int(2 * r - 1)``, so ratings in steps of 0.5
    from 0.5 to 5 map to rows 0 to 9.

    Args:
        item: column index into the matrix returned by ``Data.get_ratings()``.
    """
    with msg("plotting rating distribution"):
        item_ratings = Data.get_ratings()[:, item]
        nyms = Data.get_nyms()

        grid = np.zeros((10, len(nyms)))
        for nym_n, nym in enumerate(nyms):
            values, tallies = np.unique(item_ratings[nym].data,
                                        return_counts=True)
            for rating, tally in zip(values, tallies):
                grid[int(2 * rating - 1), nym_n] = tally

        ax = sns.heatmap(grid)
        ax.set(
            title="Distribution of item #{} ratings by group".format(int(item)),
            xlabel="group number",
            ylabel="rating",
            yticklabels=np.linspace(0.5, 5, 10))

        plt.show()
예제 #18
0
import numpy as np
import matplotlib.pyplot as plt

from myutils import msg
from datareader import DataReader
from dist_model import DiscreteNormal as DiscNorm

# Number of distinct rating values; get_data_dist() sizes its output with it.
rating_count = 5
# Model built over rating_count + 1 evenly spaced points from 0.5 to 5.5
# (presumably the bin edges around the integer ratings 1..5 - confirm against
# DiscreteNormal).
dist_gen = DiscNorm(np.linspace(0.5, 5.5, num=rating_count + 1))

# NOTE: this runs at import time, so importing the module triggers all of
# these DataReader loads. The results are module-level globals.
with msg("Getting data"):
    Rtilde = DataReader.get_Rtilde()
    Rvar = DataReader.get_Rvar()
    R = DataReader.get_ratings()
    lam = DataReader.get_lam()
    P = DataReader.get_nyms()


def get_data_dist(data):
    """Return the empirical distribution of the rating values in ``data``.

    The result has ``rating_count`` entries. The fraction of ``data`` equal
    to a rating value ``v`` is stored at index ``int(v) - 1``; values that do
    not occur keep a fraction of 0.
    """
    values, tallies = np.unique(data, return_counts=True)
    slots = values.astype(int) - 1
    fractions = tallies / tallies.sum()

    dist = np.zeros(rating_count)
    dist[slots] = fractions
    return dist


def get_err(data, mean, var):
    """Return the absolute ratio of the empirical to the model distribution.

    The empirical distribution comes from ``get_data_dist(data)`` and the
    model distribution from ``dist_gen.pmf(mean, var)``.

    NOTE(review): despite the name this is an element-wise ratio, not a
    difference - confirm that is the intended error measure.
    """
    observed = get_data_dist(data)
    expected = dist_gen.pmf(mean, var)
    ratio = observed / expected
    return abs(ratio)

예제 #19
0
def run(label=None, user_n=500, sample_n=500, n_points=default_n_points, correct_error=False, thresh=60, best_n=100, with_rmse=True, 
        with_accuracy=True, save_points=True, baseline=True, weight=False, plot_spread=False, plot=True, hard_memb=False):
    """Run one experiment: build an item pool, generate users, score each point.

    For each point ``0 .. n_points - 1`` posteriors are computed (point 0 uses
    the priors alone) and the max/median/min accuracy and RMSE over the
    returned values are recorded. Results are optionally saved and plotted.

    Args:
        label: name of the item-pool strategy; None means 'sampling by pop'.
            The other recognised labels each call the matching ``g.*``
            selector with ``best_n``. Also used for file names and plots.
        user_n: number of users passed to ``Users``.
        sample_n: upper bound on the number of samples drawn per point.
        n_points: number of points to compute.
        correct_error: not referenced anywhere in the body.
        thresh: value passed to ``g.thresh``.
        best_n: argument for the selector chosen by ``label``.
        with_rmse: compute (and save/plot) RMSE statistics.
        with_accuracy: compute (and save/plot) accuracy statistics.
        save_points: save the per-point arrays with ``np.save``.
        baseline: forwarded to ``plot_accuracy`` / ``plot_rmse``.
        weight: forwarded to ``g.sample``.
        plot_spread: plot min/median/max spreads; takes precedence over
            ``plot``.
        plot: plot the best values (max accuracy, min RMSE).
        hard_memb: additionally compute RMSE with ``hard_memb=True``; this
            only happens when ``with_rmse`` is also true.

    Returns:
        None.
    """

    # pick item pool
    with msg('Configuring item pool'):
        g = GroupRatings()
        g.thresh(thresh)

        if label is None: label = 'sampling by pop'
        # 'sampling by pop' matches none of the selectors below, so the pool
        # is only restricted by the threshold in that case.
        if label == 'highest pop':      g.highest_pop(best_n)
        if label == 'lowest pop':       g.lowest_pop(best_n)
        if label == 'highest variance': g.highest_var(best_n)
        if label == 'lowest variance':  g.lowest_var(best_n)
        if label == 'lowest entropy':   g.lowest_entropy(best_n)
        if label == 'highest 2-norm':   g.highest_pnorm(best_n)
        if label == 'highest max-norm': g.highest_maxnorm(best_n);

    # generate user data
    with msg('Generating users'):
        users = Users(training=g, user_n=user_n)

    priors = g.group_size_dist()
    liklihoods = g.dist()
    # One slot per point for each recorded statistic.
    max_accuracies = np.zeros(n_points)
    median_accuracies = np.zeros(n_points)
    min_accuracies = np.zeros(n_points)
    max_rmses = np.zeros(n_points)
    median_rmses = np.zeros(n_points)
    min_rmses = np.zeros(n_points)
    # The *_hard arrays exist only when hard_memb is set; every later use is
    # guarded by the same flag.
    if hard_memb:
        max_rmses_hard = np.zeros(n_points)
        median_rmses_hard = np.zeros(n_points)
        min_rmses_hard = np.zeros(n_points)
    for point in range(0, n_points):
        with msg(f'Computing point {point}'):
            with msg('Getting posteriors'):
                if point > 0:
                    # `point` doubles as the number of items per sample; the
                    # sample count is capped at item_count ** point.
                    samples = g.sample(min(sample_n, g.item_count()**point), items_per=point, weight=weight)
                    posteriors = get_posteriors(liklihoods, users.training_ratings, samples, priors)
                else:
                    # Point 0: no items observed, so every user's posterior
                    # is the prior.
                    posteriors = np.full((1, user_n, priors.shape[0]), priors)

            if with_accuracy: 
                with msg('Getting max accuracy'): 
                    accuracies = get_accuracy(posteriors, users)
                    max_accuracies[point] = np.max(accuracies)
                    median_accuracies[point] = np.median(accuracies)
                    min_accuracies[point] = np.min(accuracies)

            if with_rmse: 
                with msg('Getting min RMSE'): 
                    rmses = get_rmse(posteriors, users, hard_memb=False)
                    max_rmses[point] = np.max(rmses)
                    median_rmses[point] = np.median(rmses)
                    min_rmses[point] = np.min(rmses)

                if hard_memb:
                    with msg('Getting min RMSE hard memeb'): 
                        rmses_hard = get_rmse(posteriors, users, hard_memb=True)
                        max_rmses_hard[point] = np.max(rmses_hard)
                        median_rmses_hard[point] = np.median(rmses_hard)
                        min_rmses_hard[point] = np.min(rmses_hard)

    if save_points:
        if label == 'sampling by pop':
            # The default strategy is stored under the name 'passive' and
            # saves the medians rather than the best values.
            plabel = 'passive'
            if with_accuracy: 
                with msg('saving', accuracy_save_file(plabel, n_points)):
                    np.save(accuracy_save_file(plabel, n_points), median_accuracies)
            if with_rmse: 
                with msg('saving', rmse_save_file(plabel, n_points)):
                    np.save(rmse_save_file(plabel, n_points), median_rmses)
            if hard_memb and with_rmse:
                    with msg('saving', rmse_save_file(plabel, n_points, hard_memb=True)):
                        np.save(rmse_save_file(plabel, n_points, hard_memb=True), median_rmses_hard)
        else:
            # Every other strategy saves the best values: max accuracy and
            # min RMSE.
            if with_accuracy: 
                with msg('saving', accuracy_save_file(label, n_points)):
                    np.save(accuracy_save_file(label, n_points), max_accuracies)
            if with_rmse: 
                with msg('saving', rmse_save_file(label, n_points)):
                    np.save(rmse_save_file(label, n_points), min_rmses)
            if hard_memb and with_rmse: 
                with msg('saving', rmse_save_file(label, n_points, hard_memb=True)):
                    np.save(rmse_save_file(label, n_points, hard_memb=True), min_rmses_hard)


    # plot_spread takes precedence over plot.
    if plot_spread:
        if with_accuracy: plot_accuracy_spread(min_accuracies, median_accuracies, max_accuracies, label)
        if with_rmse: plot_rmse_spread(min_rmses, median_rmses, max_rmses, users.test_data_rmse(), label, hard_memb=False)        
        if hard_memb:
            if with_rmse: plot_rmse_spread(min_rmses_hard, median_rmses_hard, max_rmses_hard, users.test_data_rmse(), label, hard_memb=True)        
    elif plot:
        if with_accuracy: plot_accuracy(max_accuracies, label, baseline)
        if with_rmse: plot_rmse(min_rmses, users.test_data_rmse(), label, baseline, hard_memb=False)
        if hard_memb:
            if with_rmse: plot_rmse(min_rmses_hard, users.test_data_rmse(), label, baseline, hard_memb=True)