def main(restraints, x_axis, y_axis, one_graph=False):
    """
    Load the logged data, keep what matches `restraints`, and plot
    `y_axis` against `x_axis`.

    With `one_graph` falsy, `plot` is called once per extracted series.
    Otherwise `plot` is called a single time with all series together.
    Always returns None.
    """
    df, header = load()
    selected = filter_data(df, header, restraints)
    xs, ys, hs = extract_plotable_data(selected, header, x_axis, y_axis)

    if one_graph:
        # Single call: hand every series to plot at once.
        plot(xs, ys, x_axis, y_axis, hs, header, one_graph)
        return None

    # One call per series.
    for series_x, series_y, series_h in zip(xs, ys, hs):
        plot(series_x, series_y, x_axis, y_axis, series_h, header, one_graph)
    return None
def find_best(restraints):
    """
    Find the best config.

    Loads the logged data, keeps the entries matching `restraints`, and
    selects the entry whose last row has the highest 'best_val_f1' value.
    That last row is handed to `gen_config` together with `restraints` and
    `header`; whatever `gen_config` returns is returned.

    On ties the first entry encountered wins. If nothing matches the
    restraints, `gen_config` receives None as its last argument.
    """
    # load data and get ready to search
    df, header = load()
    filt = filter_data(df, header, restraints)
    search = 'best_val_f1'
    loc = header[search]

    # Search for the max. The running maximum starts at -inf rather than 0:
    # with a strict '>' comparison a start value of 0 would skip every run
    # scoring exactly 0 (or below) and leave `best` as None even though
    # runs matched.
    best_score = float('-inf')
    best = None
    for run in filt:
        final_row = run[-1]
        score = final_row[loc]
        if score > best_score:
            best_score = score
            best = final_row
    return gen_config(restraints, header, best)
def graph_error_bar(restraints, x_axis, y_axis, error_bar, diff):
    """
    Draw a bar chart with error bars, one bar per entry of `restraints`.

    For every restraint the matching data is extracted twice (once for
    `y_axis`, once for `error_bar`), each extraction is reduced with
    `multi_average(..., 'sample')`, and only the last point of each reduced
    series is kept: the last x, the last `y_axis` value and the last
    `error_bar` value. The collected points go to `plot_seaborn_bar`.
    """
    df, header = load()
    bar_xs, bar_ys, bar_errors = [], [], []

    for restraint in restraints:
        runs = filter_data(df, header, restraint)
        raw_x, raw_y, _ = extract_plotable_data(runs, header, x_axis, y_axis)
        raw_err_x, raw_err, _ = extract_plotable_data(runs, header, x_axis, error_bar)

        # multi_average's result is unpacked as (y, x, max_len); only the
        # series themselves are needed here.
        avg_y, avg_x, _ = multi_average(raw_x, raw_y, 'sample')
        avg_err, _, _ = multi_average(raw_err_x, raw_err, 'sample')

        # keep the final point only
        bar_xs.append(avg_x[-1])
        bar_ys.append(avg_y[-1])
        bar_errors.append(avg_err[-1])

    plot_seaborn_bar(bar_xs, bar_ys, bar_errors, x_axis, y_axis, diff)
def overlay_graph_diff(restraints, y_name, x_axis, y_axis, config='single'):
    """
    Different restraints, same variables.

    One series is produced per entry of `restraints`; `y_name[i]` is the
    label attached to every point of the i-th series, so `y_name` needs at
    least as many entries as `restraints`.

    config == 'single' keeps the first extracted series of each restraint,
    config == 'avg' reduces each restraint's series with `multi_average`,
    and any other value leaves the extracted data as it is.
    """
    df, header = load()

    xs, ys, hs = [], [], []
    for restraint in restraints:
        runs = filter_data(df, header, restraint)
        run_x, run_y, run_h = extract_plotable_data(runs, header, x_axis, y_axis)
        xs.append(run_x)
        ys.append(run_y)
        hs.append(run_h)

    if config == 'single':
        xs = [series[0] for series in xs]
        ys = [series[0] for series in ys]
        hs = [series[0] for series in hs]
    elif config == 'avg':
        averaged_x, averaged_y = [], []
        for run_x, run_y in zip(xs, ys):
            # result is unpacked as (y, x, max_len)
            mean_y, mean_x, _ = multi_average(run_x, run_y)
            averaged_x.append(mean_x)
            averaged_y.append(mean_y)
        xs, ys = averaged_x, averaged_y

    # One label per plotted point: the i-th name repeated for the length of
    # the i-th series.
    diff = []
    for idx, series in enumerate(xs):
        diff.extend([y_name[idx]] * len(series))
    diff_name = 'Experiment Variants'

    # Flatten all series into one x list and one y list.
    fx = [value for series in xs for value in series]
    fy = [value for series in ys for value in series]

    plot_seaborn(fx, fy, diff, diff_name, x_axis, y_axis)
def overlay_graph(restraints, y_name, y_axis1, y_axis2, config='single'):
    """
    Same restraints, different variables (i.e. token_auc + entropy).

    Extracts `y_axis1` and `y_axis2` against 'epoch' for the data matching
    `restraints` and plots both on one graph, labelling each point with the
    variable it belongs to.

    config == 'single' uses the first extracted series of each variable.
    config == 'avg' averages the series element-wise with `np.mean` over
    axis 0 (so all series of a variable must have the same length) and
    uses the first series' epochs as the shared x axis.
    Any other value passes the extracted data through unchanged.

    `y_name` is accepted but not used by this function. Returns None.
    """
    df, header = load()
    filt = filter_data(df, header, restraints)
    x1, y1, _ = extract_plotable_data(filt, header, 'epoch', y_axis1)
    x2, y2, _ = extract_plotable_data(filt, header, 'epoch', y_axis2)

    if config == 'single':
        y1 = y1[0]
        y2 = y2[0]
        x1 = x1[0]
        x2 = x2[0]
    elif config == 'avg':
        y1 = list(np.mean(y1, axis=0))
        y2 = list(np.mean(y2, axis=0))
        # The averaged y is one flat series, so x must be one flat series
        # too. Previously x1/x2 stayed as lists of per-run lists, which made
        # the label list, x and y passed to plot_seaborn disagree in length.
        # NOTE(review): assumes every run shares the first run's epoch
        # values, which the equal-length requirement of np.mean suggests;
        # confirm against the data.
        x1 = list(x1[0])
        x2 = list(x2[0])

    # One label per point: first variable's points, then the second's.
    diff = [y_axis1] * len(x1) + [y_axis2] * len(x2)
    x1 = x1 + x2
    y1 = y1 + y2
    plot_seaborn(x1, y1, diff, y_axis1 + " vs. " + y_axis2, 'epochs', 'combined')
    return None
def negative_epoch_graph(restraints, diff, y_name, y_axis1, y_axis2, config='avg'):
    """
    Plot, per restraint, a `y_axis1` segment on negative epochs followed by
    a `y_axis2` segment, and hand everything to `plot_seaborn_neg`.

    For every entry of `restraints`:
      * both variables are extracted against 'epoch' and reduced with
        `multi_average(..., config)`;
      * if `y_axis1` data exists, its x values are shifted down by the
        length of the series, the `y_axis2` x values are shifted down by 1,
        and a joining point (x = 0.0, y = first `y_axis2` value) is appended
        to the first segment so the two lines connect; the series is
        flagged as dashed;
      * otherwise only the shifted `y_axis2` segment is used and the series
        is flagged as not dashed.

    `splits` receives the series index once per plotted point, `dashed` and
    `colors` receive one entry per series.

    `config` must be 'avg' or 'sample'; any other value raises ValueError as
    soon as there is a restraint to process. `y_name` is accepted but not
    used by this function. Returns None.
    """
    df, header = load()
    fx, fy = [], []  # final x, final y (all series concatenated)
    splits, dashed, colors = [], [], []
    color_order = [
        (0.00392156862745098, 0.45098039215686275, 0.6980392156862745),
        (0.00392156862745098, 0.45098039215686275, 0.6980392156862745),
        (0.00784313725490196, 0.6196078431372549, 0.45098039215686275),
        (0.00784313725490196, 0.6196078431372549, 0.45098039215686275),
    ]
    supported = config == 'avg' or config == 'sample'

    for idx, r in enumerate(restraints):
        if not supported:
            # Previously this path fell through to an unbound-variable
            # NameError; fail with an explicit message instead.
            raise ValueError(
                "negative_epoch_graph: unsupported config %r "
                "(expected 'avg' or 'sample')" % (config,)
            )

        filt = filter_data(df, header, r)
        x1, y1, _ = extract_plotable_data(filt, header, 'epoch', y_axis1)
        x2, y2, _ = extract_plotable_data(filt, header, 'epoch', y_axis2)

        if len(y1) != 0:
            y1, x1, max_len1 = multi_average(x1, y1, config)
            y2, x2, max_len2 = multi_average(x2, y2, config)
            # shift the first segment onto negative x, the second down by 1
            x1 = list(np.asarray(x1) - len(x1))
            x2 = list(np.asarray(x2) - 1)
            # connect the lines
            x1.append(0.0)
            y1.append(y2[0])
            # set up data
            x = x1 + x2
            y = y1 + y2
            # +1 accounts for the joining point appended above
            splits += [idx] * (max_len1 + 1) + [idx] * max_len2
            dashed.append(True)
        else:
            y, x, max_len = multi_average(x2, y2, config)
            x = list(np.asarray(x) - 1)
            splits += [idx] * max_len
            dashed.append(False)

        # Wrap around the palette so more than len(color_order) restraints
        # no longer raise IndexError.
        colors.append(color_order[idx % len(color_order)])

        # add data
        fx.extend(x)
        fy.extend(y)

    plot_seaborn_neg(fx, fy, diff, splits, dashed, colors, 'epoch', y_axis1)
    return None
def simulation(number_of_user_to_test, number_of_it_per_user):
    """
    Simulate sequential film recommendation for a set of randomly picked
    users and plot the averaged cumulative reward and RMSE.

    For every selected user, ALS is fitted on the ratings of all other
    users. Then each of three recommendation methods (0: random,
    1: k-means clusters, 2: film graph) is run `number_of_it_per_user`
    times; each run recommends up to `num_films_to_recommend` films. After
    every recommendation the user's known rating for that film is written
    into `als.train` and the user's row of `als.U` is re-solved.

    Reads the module-level names `num_films_to_recommend` and
    `minimum_number_of_films_rated`. Uses Python 2 print statements.

    number_of_user_to_test -- how many distinct users to pick.
    number_of_it_per_user  -- repetitions per user and per method.

    Returns 0.
    """
    data = load_data("../data/u.data")
    data = remove_duplicate(data)
    data = filter_data(data)
    # Per-step accumulators, one slot per recommendation step.
    cumulated_reward_random = np.zeros(num_films_to_recommend)
    cumulated_reward_dist = np.zeros(num_films_to_recommend)
    cumulated_reward_kmeans = np.zeros(num_films_to_recommend)
    RMSE_random = np.zeros(num_films_to_recommend)
    RMSE_dist = np.zeros(num_films_to_recommend)
    RMSE_kmeans = np.zeros(num_films_to_recommend)
    d = 4  # first argument of ALS below; presumably the latent dimension -- TODO confirm
    it_max = 10  # passed to suggest_one_film_random
    # find the number of unique users (column 0 of data)
    num_users = len(np.unique(data[:, 0]))
    # find the number of unique films (column 1 of data)
    num_items = len(np.unique(data[:, 1]))
    # Select a set of distinct random users for the simulation.
    random_users_selected = []
    nb_it = 0
    while len(random_users_selected) < number_of_user_to_test:
        random_user = pick_random_user(data, num_users, minimum_number_of_films_rated)
        nb_it += 1
        if not (random_user in random_users_selected):
            random_users_selected.append(random_user)
        # Give up after 1000 draws; the list may then hold fewer users than
        # requested.
        if nb_it > 1000:
            print 'probably not enough users. Lower the minimum_number_of_films_rated'
            break
    print "number_of_films_in_DB", num_items
    print "number_of_users_in_DB", num_users
    print "number_of_ratings ", len(data[:, 0])
    print "random_users_selected", random_users_selected
    # initialisation of als (the same object is re-fitted for every user)
    als = ALS(d, num_users, num_items, "Users", "Movies", "Ratings", num_iters=20, verbose=True,lbda = 0.1,lbda2 = 0.1)
    for random_user_selected in random_users_selected:
        # Extract the list of films id for which we know the random user's ratings
        candidate_set = data[:, 1][data[:, 0] == random_user_selected]
        print "candidate_set", candidate_set
        # remove the user randomly selected from the DB
        R_dict = dict()
        R_dict["Users"] = data[:, 0][data[:, 0] != random_user_selected]
        R_dict["Movies"] = data[:, 1][data[:, 0] != random_user_selected]
        R_dict["Ratings"] = data[:, 2][data[:, 0] != random_user_selected]
        # mean rating over all other users; passed to RMSE below
        mean = data[:, 2][data[:, 0] != random_user_selected].mean()
        # Get the first decomposition R = U*V
        print "Fitting..."
        als.fit(R_dict)
        print "Done."
        print "Global RMSE", RMSE_total(als.U.dot(als.V.T), data)
        # Snapshot the fitted state so every run below starts from it.
        mem_train = als.train.copy()
        mem_U = als.U.copy()
        # Compute distance matrix of films based on V
        print "Building_film_graph..."
        distances = build_film_graph(als.V)
        print "Done."
        # Generate a graph from this distance matrix (G is fixed until the end)
        # NOTE(review): nx.from_numpy_matrix was removed in NetworkX 3.0
        # (from_numpy_array is the replacement) -- confirm the pinned version.
        G = nx.from_numpy_matrix(distances)
        # Apply kmeans to get cluster assignment of films
        clusters_assignment = build_film_clusters(als.V)
        # all rows of data belonging to the held-out user
        intermediate = data[data[:, 0] == random_user_selected]
        for recommendation_method in range(0, 3):
            for it in range(0, number_of_it_per_user):
                # restore the freshly fitted state
                als.train = mem_train.copy()
                als.U = mem_U.copy()
                # init values
                ever_seen = []
                R_user = np.zeros(num_items)
                for i in range(0, num_films_to_recommend):
                    # Dispatch on the method: 2 = film graph, 1 = k-means
                    # clusters, 0 = random.
                    if recommendation_method == 2:
                        recommendation = suggest_one_film(G, R_user, ever_seen, candidate_set)
                    if recommendation_method == 1:
                        recommendation = suggest_one_film_kmeans(clusters_assignment, R_user, ever_seen, candidate_set)
                    if recommendation_method == 0:
                        recommendation = suggest_one_film_random(R_user, ever_seen, candidate_set, it_max)
                    # -1 is treated as "nothing left to recommend"
                    if recommendation == -1:
                        print 'we explored all the possible solutions'
                        break
                    recommendation = int(recommendation)
                    # rating (column 2) the held-out user gave to this film
                    reward = intermediate[intermediate[:, 1] == recommendation][0, 2]
                    verbose = False
                    # update the reward and RMSE accumulators of the active method
                    if recommendation_method == 2:
                        cumulated_reward_dist[i] += reward
                        RMSE_dist[i] += RMSE(R_user, candidate_set, intermediate, mean, verbose)
                    if recommendation_method == 1:
                        cumulated_reward_kmeans[i] += reward
                        RMSE_kmeans[i] += RMSE(R_user, candidate_set, intermediate, mean, verbose)
                    if recommendation_method == 0:
                        cumulated_reward_random[i] += reward
                        RMSE_random[i] += RMSE(R_user, candidate_set, intermediate, mean, verbose)
                    ever_seen.append(recommendation)
                    # add the value to train
                    als.train[random_user_selected, recommendation] = reward
                    # get the indices of the user's non-zero entries
                    # (presumably als.train is a scipy sparse matrix, given
                    # the .toarray() calls below -- TODO confirm)
                    indices = als.train[random_user_selected].nonzero()[1]
                    # update R_u
                    R_u = als.train[random_user_selected, indices]
                    Hix = als.V[indices, :]
                    HH = Hix.T.dot(Hix)
                    # Regularised solve for the user's row of U:
                    # (Hix^T Hix + lbda * n * I) u = Hix^T R_u
                    M = HH + np.diag(als.lbda*len(R_u.toarray().T)*np.ones(als.d))
                    als.U[random_user_selected, :] = np.linalg.solve(M, Hix.T.dot(R_u.toarray().T)).reshape(als.d)
                    # Refresh the predicted ratings for the candidate films.
                    # This loop reuses the name `i`; the enclosing range loop
                    # rebinds it on its next iteration.
                    for i in candidate_set:
                        R_user[i] = als.U[random_user_selected, :].dot(als.V[i, :].T)
                    if verbose:
                        print '\n'
                        print '='*40
                        print "recommendation_method", recommendation_method
                        print "it", it
                        print "R_user[ever_seen]", R_user[ever_seen]
                        print '='*40
    # makes it cumulative
    for i in range(1, num_films_to_recommend):
        cumulated_reward_random[i] += cumulated_reward_random[i-1]
        cumulated_reward_dist[i] += cumulated_reward_dist[i-1]
        cumulated_reward_kmeans[i] += cumulated_reward_kmeans[i-1]
    # normalize
    # NOTE(review): the denominator uses number_of_user_to_test even when the
    # selection loop above gave up early and fewer users were simulated;
    # steps skipped by the `break` on -1 are also still counted.
    cumulated_reward_random /= number_of_it_per_user*number_of_user_to_test
    cumulated_reward_dist /= number_of_it_per_user*number_of_user_to_test
    cumulated_reward_kmeans /= number_of_it_per_user*number_of_user_to_test
    RMSE_dist /= number_of_it_per_user*number_of_user_to_test
    RMSE_kmeans /= number_of_it_per_user*number_of_user_to_test
    RMSE_random /= number_of_it_per_user*number_of_user_to_test
    print "RMSE_dist", RMSE_dist
    print "RMSE_kmeans", RMSE_kmeans
    print "RMSE_random", RMSE_random
    #plt.ion()
    # First figure: cumulative reward per recommendation step.
    plt.plot(range(0, num_films_to_recommend), cumulated_reward_random, 'r', label='random')  # random method
    plt.plot(range(0, num_films_to_recommend), cumulated_reward_dist, 'b', label='dist')  # film-graph method
    plt.plot(range(0, num_films_to_recommend), cumulated_reward_kmeans, 'g', label='k-means')  # k-means method
    plt.legend(loc=0)
    plt.title('cumulative reward')
    # Second figure: RMSE per step, starting from step 1 (step 0 is not plotted).
    plt.figure()
    plt.plot(range(1, num_films_to_recommend), RMSE_random[1: num_films_to_recommend], 'r', label='random')  # random method
    plt.plot(range(1, num_films_to_recommend), RMSE_dist[1: num_films_to_recommend], 'b', label='dist')  # film-graph method
    plt.plot(range(1, num_films_to_recommend), RMSE_kmeans[1: num_films_to_recommend], 'g', label='k-means')  # k-means method
    plt.legend(loc=0)
    plt.title('RMSE')
    plt.show()
    return 0