Exemplos de filter_data em Python, exemplos de load_data.filter_data em Python

Exemplo n.º 1

0

Exibir arquivo

def main(restraints, x_axis, y_axis, one_graph=False):
    """Load the data, filter it by ``restraints`` and plot ``y_axis`` vs ``x_axis``.

    When ``one_graph`` is falsy, ``plot`` is called once per extracted
    series; otherwise it is called a single time with all series.
    Always returns ``None``.
    """
    table, header = load()
    selected = filter_data(table, header, restraints)
    xs, ys, hs = extract_plotable_data(selected, header, x_axis, y_axis)

    if one_graph:
        # Everything goes on a single figure.
        plot(xs, ys, x_axis, y_axis, hs, header, one_graph)
        return None

    # One figure per series.
    for series_x, series_y, series_h in zip(xs, ys, hs):
        plot(series_x, series_y, x_axis, y_axis, series_h, header, one_graph)

    return None

Exemplo n.º 2

0

Exibir arquivo

def find_best(restraints):
    """Build a config from the filtered entry with the highest ``best_val_f1``.

    Only the last row of each filtered entry is inspected. An entry is
    selected only if its score is strictly greater than 0 and strictly
    greater than every earlier entry; if none qualifies, ``None`` is passed
    to ``gen_config`` as the best row.
    """
    table, header = load()
    candidates = filter_data(table, header, restraints)
    score_col = header['best_val_f1']

    top_score, top_row = 0, None
    for entry in candidates:
        last_row = entry[-1]
        score = last_row[score_col]
        if score > top_score:
            top_score, top_row = score, last_row

    return gen_config(restraints, header, top_row)

Exemplo n.º 3

0

Exibir arquivo

def graph_error_bar(restraints, x_axis, y_axis, error_bar, diff):
    """Draw a bar chart with error bars, one bar per entry of ``restraints``.

    For each restraint the data is filtered, the ``y_axis`` and ``error_bar``
    columns are extracted against ``x_axis``, both are reduced with
    ``multi_average(..., 'sample')``, and the last point of each reduced
    series is kept. The collected points are handed to ``plot_seaborn_bar``.
    """
    table, header = load()
    bar_xs = []
    bar_ys = []
    bar_errors = []

    for restraint in restraints:
        selected = filter_data(table, header, restraint)
        val_x, val_y, _ = extract_plotable_data(selected, header, x_axis,
                                                y_axis)
        err_x, err_y, _ = extract_plotable_data(selected, header, x_axis,
                                                error_bar)

        # multi_average returns (y, x, max_len) in that order.
        val_y, val_x, _ = multi_average(val_x, val_y, 'sample')
        err_y, err_x, _ = multi_average(err_x, err_y, 'sample')

        # Only the final point of each reduced series is plotted.
        bar_xs.append(val_x[-1])
        bar_ys.append(val_y[-1])
        bar_errors.append(err_y[-1])

    plot_seaborn_bar(bar_xs, bar_ys, bar_errors, x_axis, y_axis, diff)

Exemplo n.º 4

0

Exibir arquivo

def overlay_graph_diff(restraints, y_name, x_axis, y_axis, config='single'):
    """Overlay the same variables for several different restraints.

    ``y_name`` supplies one label per restraint; each label is repeated once
    per point of the corresponding series so the plot can tell the series
    apart.

    ``config`` selects how the runs matching a restraint are reduced:
    ``'single'`` keeps the first run, ``'avg'`` reduces them with
    ``multi_average``. Any other value leaves the extracted data untouched.
    """
    table, header = load()
    xs, ys, hs = [], [], []
    for restraint in restraints:
        selected = filter_data(table, header, restraint)
        run_x, run_y, run_h = extract_plotable_data(selected, header, x_axis,
                                                    y_axis)
        xs.append(run_x)
        ys.append(run_y)
        hs.append(run_h)

    if config == 'single':
        # Keep only the first run of every restraint.
        xs = [runs[0] for runs in xs]
        ys = [runs[0] for runs in ys]
        hs = [runs[0] for runs in hs]
    elif config == 'avg':
        averaged_xs, averaged_ys = [], []
        for run_x, run_y in zip(xs, ys):
            # multi_average returns (y, x, max_len) in that order.
            avg_y, avg_x, _ = multi_average(run_x, run_y)
            averaged_xs.append(avg_x)
            averaged_ys.append(avg_y)
        xs, ys = averaged_xs, averaged_ys

    # One label per plotted point, grouped by restraint.
    diff = []
    for idx, series in enumerate(xs):
        diff.extend([y_name[idx]] * len(series))
    diff_name = 'Experiment Variants'

    # Flatten all series into single x / y sequences.
    fx, fy = [], []
    for series_x, series_y in zip(xs, ys):
        for value in series_x:
            fx.append(value)
        for value in series_y:
            fy.append(value)

    plot_seaborn(fx, fy, diff, diff_name, x_axis, y_axis)

Exemplo n.º 5

0

Exibir arquivo

def overlay_graph(restraints, y_name, y_axis1, y_axis2, config='single'):
    """Same restraints, different variables (i.e. token_auc + entropy).

    Filters the data once with ``restraints``, extracts ``y_axis1`` and
    ``y_axis2`` against ``'epoch'`` and plots both on one figure, labelling
    every point with the variable it belongs to.

    Args:
        restraints: filter passed to ``filter_data``.
        y_name: unused; kept so existing callers keep working.
        y_axis1: name of the first variable to plot.
        y_axis2: name of the second variable to plot.
        config: ``'single'`` plots the first matching run; ``'avg'`` plots
            the element-wise mean over all matching runs. Any other value
            leaves the extracted data untouched.

    Returns:
        None.
    """
    df, header = load()
    filt = filter_data(df, header, restraints)
    x1, y1, _ = extract_plotable_data(filt, header, 'epoch', y_axis1)
    x2, y2, _ = extract_plotable_data(filt, header, 'epoch', y_axis2)

    if config == 'single':
        # list() guarantees that the ``+`` below concatenates.
        x1, y1 = list(x1[0]), list(y1[0])
        x2, y2 = list(x2[0]), list(y2[0])

    elif config == 'avg':
        y1 = list(np.mean(y1, axis=0))
        y2 = list(np.mean(y2, axis=0))
        # The averaged y series has one value per epoch, so x must be reduced
        # to a single run as well; otherwise ``diff`` and the concatenated x
        # values would be sized by the number of runs instead of the number
        # of points. Assumes all runs share the first run's epoch values
        # (np.mean over axis 0 already needs equal-length runs) -- TODO confirm.
        x1 = list(x1[0])
        x2 = list(x2[0])

    # One label per point: first variable's points, then the second's.
    diff = [y_axis1] * len(x1) + [y_axis2] * len(x2)
    x1 = x1 + x2
    y1 = y1 + y2
    plot_seaborn(x1, y1, diff, y_axis1 + " vs. " + y_axis2, 'epochs',
                 'combined')
    return None

Exemplo n.º 6

0

Exibir arquivo

def negative_epoch_graph(restraints,
                         diff,
                         y_name,
                         y_axis1,
                         y_axis2,
                         config='avg'):
    """Plot two phases per restraint on one epoch axis, the first at x <= 0.

    For every restraint, ``y_axis1`` and ``y_axis2`` are extracted against
    ``'epoch'`` and reduced with ``multi_average``. When ``y_axis1`` data
    exists, its x values are shifted by minus their count, the ``y_axis2``
    x values are shifted by -1, and the two curves are joined at x = 0 with
    the first ``y_axis2`` value. Such a series is flagged ``True`` in
    ``dashed``. When there is no ``y_axis1`` data, only the ``y_axis2``
    curve is used and the series is flagged ``False``.

    Args:
        restraints: iterable of filters, one plotted series per entry.
        diff: forwarded unchanged to ``plot_seaborn_neg``.
        y_name: unused; kept so existing callers keep working.
        y_axis1: variable for the first phase (also used as the y label).
        y_axis2: variable for the second phase.
        config: reduction mode forwarded to ``multi_average``; must be
            ``'avg'`` or ``'sample'``.

    Returns:
        None.

    Raises:
        ValueError: if ``config`` is not supported and there is at least one
            restraint to process.
    """
    df, header = load()
    xs, ys, splits, dashed, colors = [], [], [], [], []
    color_order = [
        (0.00392156862745098, 0.45098039215686275, 0.6980392156862745),
        (0.00392156862745098, 0.45098039215686275, 0.6980392156862745),
        (0.00784313725490196, 0.6196078431372549, 0.45098039215686275),
        (0.00784313725490196, 0.6196078431372549, 0.45098039215686275),
    ]
    i = 0  # index of the series being built
    for r in restraints:
        if config != 'avg' and config != 'sample':
            # Without this check ``x``/``y`` below would be unbound on the
            # first restraint or silently reused from the previous one.
            raise ValueError(
                "unsupported config %r; expected 'avg' or 'sample'" %
                (config,))

        filt = filter_data(df, header, r)
        x1, y1, _ = extract_plotable_data(filt, header, 'epoch', y_axis1)
        x2, y2, _ = extract_plotable_data(filt, header, 'epoch', y_axis2)

        if len(y1) != 0:
            # multi_average returns (y, x, max_len) in that order.
            y1, x1, max_len1 = multi_average(x1, y1, config)
            y2, x2, max_len2 = multi_average(x2, y2, config)

            # Shift the first phase by its own length and the second by 1.
            x1 = list(np.asarray(x1) - len(x1))
            x2 = list(np.asarray(x2) - 1)

            # Connect the two curves at x = 0.
            x1.append(0.0)
            y1.append(y2[0])

            x = x1 + x2
            y = y1 + y2
            splits += [i] * (max_len1 + 1) + [i] * max_len2
            dashed.append(True)
        else:
            y, x, max_len = multi_average(x2, y2, config)
            x = list(np.asarray(x) - 1)
            splits += [i] * max_len
            dashed.append(False)

        # Cycle through the palette so more series than colors cannot raise
        # an IndexError.
        colors.append(color_order[i % len(color_order)])
        i += 1

        xs.append(x)
        ys.append(y)

    # Flatten all series into the final x / y sequences.
    fx = []
    fy = []
    for x, y in zip(xs, ys):
        fx.extend(x)
        fy.extend(y)

    plot_seaborn_neg(fx, fy, diff, splits, dashed, colors, 'epoch', y_axis1)
    return None

Exemplo n.º 7

0

Exibir arquivo

Arquivo: simulation.py Projeto: luctchak/GraphML

def simulation(number_of_user_to_test, number_of_it_per_user):
    data = load_data("../data/u.data")
    data = remove_duplicate(data)
    data = filter_data(data)

    cumulated_reward_random = np.zeros(num_films_to_recommend)
    cumulated_reward_dist = np.zeros(num_films_to_recommend)
    cumulated_reward_kmeans = np.zeros(num_films_to_recommend)

    RMSE_random = np.zeros(num_films_to_recommend)
    RMSE_dist = np.zeros(num_films_to_recommend)
    RMSE_kmeans = np.zeros(num_films_to_recommend)

    d = 4
    it_max = 10
    # find the number of unique users
    num_users = len(np.unique(data[:, 0]))
    # find the number of unique films
    num_items = len(np.unique(data[:, 1]))
    # Select a set of random user for simulation
    random_users_selected = []

    nb_it = 0
    while len(random_users_selected) < number_of_user_to_test:
        random_user = pick_random_user(data, num_users, minimum_number_of_films_rated)
        nb_it += 1
        if not (random_user in random_users_selected):
            random_users_selected.append(random_user)
        if nb_it > 1000:
            print 'probably not enough users. Lower the minimum_number_of_films_rated'
            break
    print "number_of_films_in_DB", num_items
    print "number_of_users_in_DB", num_users
    print "number_of_ratings    ", len(data[:, 0])
    print "random_users_selected", random_users_selected
    # initialisation of als
    als = ALS(d, num_users, num_items, "Users", "Movies", "Ratings", num_iters=20, verbose=True,lbda = 0.1,lbda2 = 0.1)

    for random_user_selected in random_users_selected:

        # Extract the list of films id for which we know the random user's ratings
        candidate_set = data[:, 1][data[:, 0] == random_user_selected]
        print "candidate_set", candidate_set
        # remove the user randomly selected from the DB
        R_dict = dict()
        R_dict["Users"] = data[:, 0][data[:, 0] != random_user_selected]
        R_dict["Movies"] = data[:, 1][data[:, 0] != random_user_selected]
        R_dict["Ratings"] = data[:, 2][data[:, 0] != random_user_selected]
        mean = data[:, 2][data[:, 0] != random_user_selected].mean()

        # Get the first decomposition R = U*V
        print "Fitting..."
        als.fit(R_dict)
        print "Done."
        print "Global RMSE", RMSE_total(als.U.dot(als.V.T), data)


        mem_train = als.train.copy()
        mem_U = als.U.copy()

        # Compute distance matrix of films based on V
        print "Building_film_graph..."
        distances = build_film_graph(als.V)
        print "Done."


        # Generate a graph from this distance matrix (G is fixed until the end)
        G = nx.from_numpy_matrix(distances)

        # Apply kmeans to get cluster assigment of films
        clusters_assignment = build_film_clusters(als.V)
        intermediate = data[data[:, 0] == random_user_selected]


        for recommendation_method in range(0, 3):
            for it in range(0, number_of_it_per_user):
                als.train = mem_train.copy()
                als.U = mem_U.copy()
                # init values
                ever_seen = []
                R_user = np.zeros(num_items)
                for i in range(0, num_films_to_recommend):
                    #    recommendation = suggest_one_film(G, R_user, ever_seen, candidate_set)
                    #    if recommendation == -1:
                    #        print 'we explored all the possible solutions'
                    #        break
                    # uncomment bellow to use recommendation with kmeans
                    if recommendation_method == 2:
                        recommendation = suggest_one_film(G, R_user, ever_seen, candidate_set)
                    if recommendation_method == 1:
                        recommendation = suggest_one_film_kmeans(clusters_assignment, R_user, ever_seen, candidate_set)
                    if recommendation_method == 0:
                        recommendation = suggest_one_film_random(R_user, ever_seen, candidate_set, it_max)

                    if recommendation == -1:
                        print 'we explored all the possible solutions'
                        break

                    recommendation = int(recommendation)
                    reward = intermediate[intermediate[:, 1] == recommendation][0, 2]
                    verbose = False


                    # update the RMSE

                    if recommendation_method == 2:
                        cumulated_reward_dist[i] += reward
                        RMSE_dist[i] += RMSE(R_user, candidate_set, intermediate, mean, verbose)
                    if recommendation_method == 1:
                        cumulated_reward_kmeans[i] += reward
                        RMSE_kmeans[i] += RMSE(R_user, candidate_set, intermediate, mean, verbose)
                    if recommendation_method == 0:
                        cumulated_reward_random[i] += reward
                        RMSE_random[i] += RMSE(R_user, candidate_set, intermediate, mean, verbose)

                    ever_seen.append(recommendation)
                    # add the value to train
                    als.train[random_user_selected, recommendation] = reward
                    # get the indices
                    indices = als.train[random_user_selected].nonzero()[1]

                    # update R_u
                    R_u = als.train[random_user_selected, indices]
                    Hix = als.V[indices, :]
                    HH = Hix.T.dot(Hix)
                    M = HH + np.diag(als.lbda*len(R_u.toarray().T)*np.ones(als.d))
                    als.U[random_user_selected, :] = np.linalg.solve(M, Hix.T.dot(R_u.toarray().T)).reshape(als.d)
                    for i in candidate_set:
                        R_user[i] = als.U[random_user_selected, :].dot(als.V[i, :].T)

                if verbose:
                    print '\n'
                    print '='*40
                    print "recommendation_method", recommendation_method
                    print "it", it
                    print "R_user[ever_seen]", R_user[ever_seen]
                    print '='*40

    # makes it cumulative
    for i in range(1, num_films_to_recommend):
        cumulated_reward_random[i] += cumulated_reward_random[i-1]
        cumulated_reward_dist[i] += cumulated_reward_dist[i-1]
        cumulated_reward_kmeans[i] += cumulated_reward_kmeans[i-1]

    # normalize
    cumulated_reward_random /= number_of_it_per_user*number_of_user_to_test
    cumulated_reward_dist /= number_of_it_per_user*number_of_user_to_test
    cumulated_reward_kmeans /= number_of_it_per_user*number_of_user_to_test
    RMSE_dist /= number_of_it_per_user*number_of_user_to_test
    RMSE_kmeans /= number_of_it_per_user*number_of_user_to_test
    RMSE_random /= number_of_it_per_user*number_of_user_to_test

    print "RMSE_dist", RMSE_dist
    print "RMSE_kmeans", RMSE_kmeans
    print "RMSE_random", RMSE_random

    #plt.ion()
    plt.plot(range(0, num_films_to_recommend), cumulated_reward_random, 'r', label='random')  # plotting t,a separately
    plt.plot(range(0, num_films_to_recommend), cumulated_reward_dist, 'b', label='dist')    # plotting t,b separately
    plt.plot(range(0, num_films_to_recommend), cumulated_reward_kmeans, 'g', label='k-means')  # plotting t,c separately
    plt.legend(loc=0)
    plt.title('cumulative reward')
    plt.figure()
    plt.plot(range(1, num_films_to_recommend), RMSE_random[1: num_films_to_recommend], 'r', label='random')  # plotting t,a separately
    plt.plot(range(1, num_films_to_recommend), RMSE_dist[1: num_films_to_recommend], 'b', label='dist')    # plotting t,b separately
    plt.plot(range(1, num_films_to_recommend), RMSE_kmeans[1: num_films_to_recommend], 'g', label='k-means')  # plotting t,c separately
    plt.legend(loc=0)
    plt.title('RMSE')
    plt.show()

    return 0