def calculate_metrics(df):
    """ Calculates metrics at different k (1 to 10 + 20,30,40,50)"""
    #print(df.columns)
    klist = list(range(1, 11))
    klist.extend([20, 30, 40, 50, 100, 200, 300, 500])
    print(klist)
    # 4 metrics x 2 models = 8 columns added for each of the 18 k values (144 total)
    for k in tqdm(klist):
        df['average_precision_p2v_{}'.format(k)] = df['p2v_binary'].progress_apply(lambda x: average_precision(x, k))
        df['average_precision_d2v_{}'.format(k)] = df['d2v_binary'].progress_apply(lambda x: average_precision(x, k))
        df['recall_p2v_{}'.format(k)] = df[['p2v_binary', 'ground_truth']].progress_apply(
            lambda x: recall_at_k(x.p2v_binary, x.ground_truth, k), axis=1)
        df['recall_d2v_{}'.format(k)] = df[['d2v_binary', 'ground_truth']].progress_apply(
            lambda x: recall_at_k(x.d2v_binary, x.ground_truth, k), axis=1)
        df['reciprocal_rank_p2v_{}'.format(k)] = df['p2v_binary'].progress_apply(lambda x: reciprocal_rank(x, k))
        df['reciprocal_rank_d2v_{}'.format(k)] = df['d2v_binary'].progress_apply(lambda x: reciprocal_rank(x, k))
        df['ndcg_p2v_{}'.format(k)] = df['p2v_binary'].progress_apply(lambda x: ndcg(x, k))
        df['ndcg_d2v_{}'.format(k)] = df['d2v_binary'].progress_apply(lambda x: ndcg(x, k))
    #df.to_csv('/home/ashwath/Programs/UnpaywallMAG/Evaluation/paper2vec_unpaywall_500.tsv', sep='\t')
    df.to_pickle('/home/ashwath/Programs/UnpaywallMAG/Pickles/paperwisemetrics_unpaywall_p2v_d2v_df_may21.pickle')
    print("METRICS CALCULATED, time to calculate the means")
    # Get the mean of all the index columns
    df = df.drop(['p2v_recommendations', 'p2v_binary', 'd2v_recommendations', 'd2v_binary', 'ground_truth'], axis=1)
    mean_series = df.mean()
    mean_series.to_csv('/home/ashwath/Programs/UnpaywallMAG/Evaluation/meanmetrics_p2v_d2v_may21.tsv', sep='\t', index=True, header=False)
    print("C'est fini.")
Example #2
            def dev_step():
                acc = []
                losses = []
                pred_scores = []
                true_scores = []
                count = 0
                while True:
                    try:
                        feed_dict = {
                            handle: test_handle,
                            model.dropout_keep_prob: 1.0
                        }
                        step, loss, accuracy, y_pred, target = sess.run([
                            global_step, model.loss, model.accuracy,
                            model.y_pred, model.target
                        ], feed_dict)

                        acc.append(accuracy)
                        losses.append(loss)
                        pred_scores += list(y_pred[:, 1])
                        true_scores += list(target)

                        count += 1
                        if count % 1000 == 0:
                            print("Processed {} batches".format(count))
                    except tf.errors.OutOfRangeError:
                        break

                MeanAcc = sum(acc) / len(acc)
                MeanLoss = sum(losses) / len(losses)

                with open(
                        os.path.join(out_dir,
                                     'predScores-iter-%s-test.txt' % (step)),
                        'w') as f:
                    for score1, score2 in zip(pred_scores, true_scores):
                        f.write(str(score1) + '\t' + str(score2) + '\n')

                num_sample = int(len(pred_scores) / 10)
                score_list = np.split(np.array(pred_scores),
                                      num_sample,
                                      axis=0)
                recall_2_1 = recall_2at1(score_list, k=1)

                recall_at_1 = recall_at_k(np.array(true_scores),
                                          np.array(pred_scores), 1)
                recall_at_2 = recall_at_k(np.array(true_scores),
                                          np.array(pred_scores), 2)
                recall_at_5 = recall_at_k(np.array(true_scores),
                                          np.array(pred_scores), 5)
                time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("**********************************")
                print('pred_scores: ', len(pred_scores))
                print("recall_2_1:  %.3f" % (recall_2_1))
                print("recall_at_1: %.3f" % (recall_at_1))
                print("recall_at_2: %.3f" % (recall_at_2))
                print("recall_at_5: %.3f" % (recall_at_5))
                print("**********************************")
Example #3
            def dev_step():
                acc = []
                losses = []
                pred_scores = []
                true_scores = []
                count = 0
                while True:
                    try:
                        feed_dict = {
                            handle: test_handle,
                            model.is_training: False,
                            model.dropout_keep_prob: 1.0
                        }
                        step, loss, accuracy, y_pred, target = sess.run([
                            global_step, model.loss, model.accuracy,
                            model.y_pred, model.target
                        ], feed_dict)
                        acc.append(accuracy)
                        losses.append(loss)
                        pred_scores += list(y_pred[:, 1])
                        true_scores += list(target)

                        count += 1
                        if count % 1000 == 0:
                            print("Processed {} batches".format(count))
                    except tf.errors.OutOfRangeError:
                        break

                MeanAcc = sum(acc) / len(acc)
                MeanLoss = sum(losses) / len(losses)

                if ('ubuntu' in FLAGS.data_path):
                    num_sample = int(len(pred_scores) / 10)
                    score_list = np.split(np.array(pred_scores),
                                          num_sample,
                                          axis=0)
                    recall_2_1 = recall_2at1(score_list, k=1)

                    recall_at_1 = recall_at_k(np.array(true_scores),
                                              np.array(pred_scores), 1)
                    recall_at_2 = recall_at_k(np.array(true_scores),
                                              np.array(pred_scores), 2)
                    recall_at_5 = recall_at_k(np.array(true_scores),
                                              np.array(pred_scores), 5)
                    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    print("**********************************")
                    print("%s results.........." % (flag.title()))
                    print('pred_scores: ', len(pred_scores))
                    print("Step: %d \t| loss: %.3f \t| acc: %.3f \t|  %s" %
                          (step, MeanLoss, MeanAcc, time_str))
                    print("recall_2_1:  %.3f" % (recall_2_1))
                    print("recall_at_1: %.3f" % (recall_at_1))
                    print("recall_at_2: %.3f" % (recall_at_2))
                    print("recall_at_5: %.3f" % (recall_at_5))
                    print("**********************************")
Example #4
def topk_eval(sess, args, user_path_top_k, model, user_list, train_record, eval_record, test_record, item_set, k_list, batch_size, mode = 'test'):
    precision_list = {k: [] for k in k_list}
    recall_list = {k: [] for k in k_list}
    MAP_list = {k: [] for k in k_list}
    hit_ratio_list = {k: [] for k in k_list}
    ndcg_list = {k: [] for k in k_list}

    for user in user_list:
        if mode == 'eval': ref_user = eval_record
        else: ref_user = test_record
        if user in ref_user:

            test_item_list = list(item_set - train_record[user])
            # test_item_list = test_item_list[:1000]
            item_score_map = dict()
            start = 0
            while start + batch_size <= len(test_item_list):
                items, scores = model.get_scores(sess, {model.user_indices: [user] * batch_size,
                                                    model.item_indices: test_item_list[start:start + batch_size]})

                for item, score in zip(items, scores):
                    item_score_map[item] = score
                start += batch_size

            # padding the last incomplete minibatch if exists
            if start < len(test_item_list):
                items, scores = model.get_scores(
                    sess, {model.user_indices: [user] * batch_size,
                           model.item_indices: test_item_list[start:] + [test_item_list[-1]] * (
                                   batch_size - len(test_item_list) + start)})            

                for item, score in zip(items, scores):
                    item_score_map[item] = score

            item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)
            item_sorted = [i[0] for i in item_score_pair_sorted]

            for k in k_list:
                # populate precision as well; otherwise the np.mean calls below
                # average empty lists and return NaN
                precision_list[k].append(precision_at_k(item_sorted, ref_user[user], k))
                recall_list[k].append(recall_at_k(item_sorted, ref_user[user], k))

            # ndcg: build the 0/1 hit list up to the largest cutoff (not the k
            # left over from the loop above), then score each k against it
            r_hit = []
            for i in item_sorted[:max(k_list)]:
                if i in ref_user[user]:
                    r_hit.append(1)
                else:
                    r_hit.append(0)
            for k in k_list:
                ndcg_list[k].append(ndcg_at_k(r_hit,k))

    precision = [np.mean(precision_list[k]) for k in k_list]
    recall = [np.mean(recall_list[k]) for k in k_list]
    ndcg = [np.mean(ndcg_list[k]) for k in k_list]
    # MAP = [np.mean(MAP_list[k]) for k in k_list]
    # hit_ratio = [np.mean(hit_ratio_list[k]) for k in k_list]

    return precision, recall, ndcg, None, None
Example #5
    def metric_test(self):
        """ uses tensorrec eval as benchmark for rating performance of various reco algorithms """
        k = 10
        latent_factor = 10
        n_users = 10
        n_items = 12

        interactions, user_features, item_features = util.generate_dummy_data_with_indicator(
            num_users=n_users, num_items=n_items, interaction_density=.5)
        print("interactiosn shape={}".format(np.shape(interactions)))
        print("user features shape={}".format(np.shape(
            user_features.toarray())))
        print("item features shape={}".format(np.shape(
            item_features.toarray())))

        model = TensorRec(n_components=latent_factor)

        model.fit(interactions, user_features, item_features, epochs=19)

        ranks = model.predict_rank(user_features=user_features,
                                   item_features=item_features)

        print("Ranks shape={}".format(np.shape(ranks)))

        self.assertTrue(np.shape(interactions) == np.shape(ranks))

        tr_recall_result = eval.recall_at_k(predicted_ranks=ranks,
                                            test_interactions=interactions,
                                            k=k,
                                            preserve_rows=False)
        # print (tr_recall_result.mean())

        tr_precision_result = eval.precision_at_k(
            predicted_ranks=ranks,
            test_interactions=interactions,
            k=k,
            preserve_rows=False)
        # print(tr_precision_result.mean())

        # we need csr for interactions data
        interactions_ = interactions.tocsr()
        recall_result = metrics.recall_at_k(ranks,
                                            interactions_,
                                            k=k,
                                            preserve_rows=False)
        # print(recall_result.mean())

        precision_result = metrics.precision_at_k(ranks,
                                                  interactions_,
                                                  k=k,
                                                  preserve_rows=False)
        # print (precision_result.mean())

        # mean scores are floats, so compare approximately rather than exactly
        self.assertAlmostEqual(tr_recall_result.mean(), recall_result.mean())
        self.assertAlmostEqual(tr_precision_result.mean(), precision_result.mean())
Example #6
def get_performance(user_pos_test, r, auc, Ks):
    precision, recall, ndcg, hit_ratio = [], [], [], []

    for K in Ks:
        precision.append(metrics.precision_at_k(r, K))
        recall.append(metrics.recall_at_k(r, K, len(user_pos_test)))
        ndcg.append(metrics.ndcg_at_k(r, K))
        hit_ratio.append(metrics.hit_at_k(r, K))

    return {
        'recall': np.array(recall),
        'precision': np.array(precision),
        'ndcg': np.array(ndcg),
        'hit_ratio': np.array(hit_ratio),
        'auc': auc
    }
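get_performance in Example #6 assumes a metrics module that scores a single ranked 0/1 relevance vector r (the NGCF/KGAT convention). A minimal sketch of those helpers, assuming r is ordered by predicted rank:

import numpy as np

def precision_at_k(r, k):
    # share of the top-k slots that are hits
    return float(np.mean(np.asarray(r)[:k]))

def recall_at_k(r, k, all_pos_num):
    # share of the user's positives that appear in the top k
    return float(np.sum(np.asarray(r)[:k]) / all_pos_num)

def hit_at_k(r, k):
    # 1 if at least one positive appears in the top k, else 0
    return 1.0 if np.sum(np.asarray(r)[:k]) > 0 else 0.0

def dcg_at_k(r, k):
    # discounted cumulative gain with a log2 rank discount
    r = np.asarray(r, dtype=float)[:k]
    return float(np.sum(r / np.log2(np.arange(2, r.size + 2))))

def ndcg_at_k(r, k):
    # DCG normalized by the DCG of the ideally ordered list
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / idcg if idcg > 0 else 0.0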
Example #7
            def dev_step(flag, writer):
                sess.run(valid_iterator.initializer)
                valid_handle = sess.run(valid_iterator.string_handle())
                acc = []
                losses = []
                pred_scores = []
                true_scores = []
                count = 0
                while True:
                    try:
                        feed_dict = {
                            handle: valid_handle,
                            model.is_training: False,
                            model.dropout_keep_prob: 1.0
                        }
                        step, loss, accuracy, y_pred, target = sess.run([
                            global_step, model.loss, model.accuracy,
                            model.y_pred, model.target
                        ], feed_dict)
                        acc.append(accuracy)
                        losses.append(loss)
                        pred_scores += list(y_pred[:, 1])
                        true_scores += list(target)

                        count += 1
                        if count % 1000 == 0:
                            print("Processed {} batches".format(count))
                    except tf.errors.OutOfRangeError:
                        break

                MeanAcc = sum(acc) / len(acc)
                MeanLoss = sum(losses) / len(losses)

                with open(
                        os.path.join(out_dir,
                                     'predScores-iter-%s.txt' % (step)),
                        'w') as f:
                    for score1, score2 in zip(pred_scores, true_scores):
                        f.write(str(score1) + '\t' + str(score2) + '\n')

                summary_MeanLoss = tf.Summary(value=[
                    tf.Summary.Value(tag='%s/MeanLoss' % (flag),
                                     simple_value=MeanLoss)
                ])
                summary_MeanAcc = tf.Summary(value=[
                    tf.Summary.Value(tag='%s/MeanAcc' % (flag),
                                     simple_value=MeanAcc)
                ])
                writer.add_summary(summary_MeanLoss, step)
                writer.add_summary(summary_MeanAcc, step)

                if ('ubuntu' in FLAGS.data_path):
                    num_sample = int(len(pred_scores) / 10)
                    score_list = np.split(np.array(pred_scores),
                                          num_sample,
                                          axis=0)
                    recall_2_1 = recall_2at1(score_list, k=1)

                    recall_at_1 = recall_at_k(np.array(true_scores),
                                              np.array(pred_scores), 1)
                    recall_at_2 = recall_at_k(np.array(true_scores),
                                              np.array(pred_scores), 2)
                    recall_at_5 = recall_at_k(np.array(true_scores),
                                              np.array(pred_scores), 5)
                    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    print("**********************************")
                    print("%s results.........." % (flag.title()))
                    print('pred_scores: ', len(pred_scores))
                    print("Step: %d \t| loss: %.3f \t| acc: %.3f \t|  %s" %
                          (step, MeanLoss, MeanAcc, time_str))
                    print("recall_2_1:  %.3f" % (recall_2_1))
                    print("recall_at_1: %.3f" % (recall_at_1))
                    print("recall_at_2: %.3f" % (recall_at_2))
                    print("recall_at_5: %.3f" % (recall_at_5))
                    print("**********************************")

                    summary_recall_2_1 = tf.Summary(value=[
                        tf.Summary.Value(tag='%s/recall_2_1' % (flag),
                                         simple_value=recall_2_1)
                    ])
                    summary_recall_at_1 = tf.Summary(value=[
                        tf.Summary.Value(tag='%s/recall_at_1' % (flag),
                                         simple_value=recall_at_1)
                    ])
                    summary_recall_at_2 = tf.Summary(value=[
                        tf.Summary.Value(tag='%s/recall_at_2' % (flag),
                                         simple_value=recall_at_2)
                    ])
                    summary_recall_at_5 = tf.Summary(value=[
                        tf.Summary.Value(tag='%s/recall_at_5' % (flag),
                                         simple_value=recall_at_5)
                    ])

                    writer.add_summary(summary_recall_2_1, step)
                    writer.add_summary(summary_recall_at_1, step)
                    writer.add_summary(summary_recall_at_2, step)
                    writer.add_summary(summary_recall_at_5, step)
                    return MeanLoss, recall_2_1 + recall_at_1
Example #8
def main():
    print("\nStarting '%s'" % sys.argv[0])

    np.random.seed(8000)
    """ Load dataset """
    datafile = "./data/ml-100k/u.data"
    data = pd.read_csv(datafile,
                       sep='\t',
                       names=["userid", "itemid", "rating", "timestamp"])
    """ Convert rating data to n_user x n_item matrix format """
    data = data.sort_values(by=["userid", "itemid"])
    ratings = pd.pivot_table(data,
                             values="rating",
                             index="userid",
                             columns="itemid")
    ratings.fillna(0, inplace=True)

    users = np.unique(ratings.index.values)
    items = np.unique(ratings.columns.values)
    n_users = len(users)
    n_items = len(items)
    print("n_users=%d n_items=%d" % (n_users, n_items))
    """ Take the mean only from non-zero elements """
    temp = ratings.copy()
    rating_mean = temp.copy().replace(0, np.nan).mean().mean()
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean
    print("Rating mean:%.3f" % rating_mean)
    """ Find PQ sub matrices """
    R = ratings.values
    """ Randomly initialize P & Q matrices with n latent factors """
    n_factors = 10
    P = np.random.normal(0, .1, (n_users, n_factors))
    Q = np.random.normal(0, .1, (n_factors, n_items))

    R_mask = R.copy()
    R_mask[R_mask != 0.000000] = 1
    R_hat = np.zeros(np.shape(R))
    R_hat_mask = np.zeros(np.shape(R))
    np.matmul(P, Q, out=R_hat)

    # Get errors only from explicitly rated elements
    np.multiply(R_hat, R_mask, out=R_hat_mask)
    """ Compute error: MSE = (1/N) * (R - R_hat), RMSE = MSE^(1/2) """
    diff = np.subtract(R, R_hat_mask)
    diff_square = np.square(diff)
    # average only over the explicitly rated cells; unrated zeros carry no error
    mse = np.divide(diff_square.sum(), np.count_nonzero(R_mask))
    rmse = np.sqrt(mse)

    print("RMSE: %.5f" % (rmse))

    print("Type: ", type(R_hat))
    print(R_hat[:5, :10])
    predicted_ranks = metric.rank_matrix(R_hat)
    print(predicted_ranks.shape)
    print(predicted_ranks[:5, :10])

    ratings_csr = sparse.csr_matrix(ratings.values)
    precision = metrics.precision_at_k(predicted_ranks, ratings_csr, k=100)
    recall = metrics.recall_at_k(predicted_ranks, ratings_csr, k=100)

    print("Precision {0:.5f}% Recall={1:.5f}%".format(precision * 100,
                                                      recall * 100))

    print("\nStopping '%s'" % sys.argv[0])
Example #9
def main():
    print("\nStarting '%s'" % sys.argv[0])

    np.random.seed(8000)
    normalization_enabled = False
    optimize_enabled = True
    k = 100
    """ Load dataset """
    datafile = "./data/ml-100k/u.data"
    data = pd.read_csv(datafile,
                       sep='\t',
                       names=["userid", "itemid", "rating", "timestamp"])
    """ Convert rating data to user x movie matrix format """
    data = data.sort_values(by=["userid", "itemid"])
    ratings = pd.pivot_table(data,
                             values="rating",
                             index="userid",
                             columns="itemid")
    ratings.fillna(0, inplace=True)
    """ Construct data """
    users = np.unique(ratings.index.values)
    items = np.unique(ratings.columns.values)
    n_users = len(users)
    n_items = len(items)
    print("n_users=%d n_items=%d" % (n_users, n_items))
    """ Compute mean ratingonly from non-zero elements """
    temp = ratings.copy()
    rating_mean = temp.copy().replace(0, np.nan).mean().mean()
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean
    print("Rating mean: %.6f" % rating_mean)

    R_mask = np.zeros(np.shape(ratings))
    R_mask[ratings != 0.000000] = 1

    if normalization_enabled:
        temp = ratings.copy()
        ratings_norm = np.subtract(temp, rating_mean, where=temp != 0)
        ratings_norm = np.multiply(ratings_norm, R_mask)
        assert (np.count_nonzero(ratings_norm) == np.count_nonzero(ratings))
        R = ratings_norm.values
    else:
        R = ratings.values.copy()

    # Setup covariance to treat the item columns as input variables
    covar = np.cov(R, rowvar=False)
    evals, evecs = np.linalg.eigh(covar)

    print("cov_mat shape: %s" % str(np.shape(covar)))
    print("evals shape: %s" % str(np.shape(evals)))
    print("evecs shape: %s" % str(np.shape(evecs)))

    n_components = 10  # principal components
    """ Randomly initialize weights table """
    weights = np.random.normal(0, .1, (n_users, n_components))
    # np.linalg.eigh returns eigenvectors as columns in ascending eigenvalue
    # order, so take the last n_components columns and transpose them into
    # rows of shape (n_components, n_items)
    components = evecs[:, -n_components:].T

    R_hat_mask = np.zeros(np.shape(R), dtype=np.float64)

    if optimize_enabled:
        # optimization parameters
        epochs = 5
        learning_rate = .0001
        lambda_ = .0001
        verbosity = 1
        print("Optimized PCA epochs=%s" % epochs)
        """ We only modify the weight matrix """
        for epoch in range(epochs):
            for u in range(n_users):
                for i in range(n_items):
                    error = R[u, i] - np.dot(weights[u, :], components[:, i])
                    # use a separate loop variable so the metrics cutoff k
                    # defined at the top of main() is not clobbered
                    for f in range(n_components):
                        weights[u, f] = weights[u, f] - learning_rate * (
                            error * -2 * components[f, i] + lambda_ *
                            (2 * np.abs(weights[u, f]) +
                             2 * np.abs(components[f, i])))

            R_hat = np.zeros(np.shape(R))
            np.matmul(weights, components, out=R_hat)
            # Get errors only from explicitly rated elements
            np.multiply(R_hat, R_mask, out=R_hat_mask)
            # Compute error: MSE = (1/N) * sum((R - R_hat)^2), RMSE = sqrt(MSE)
            diff = np.subtract(R, R_hat_mask)
            diff_square = np.square(diff)
            mse = np.divide(diff_square.sum(), np.count_nonzero(R))
            rmse = np.sqrt(mse)
            if epoch % verbosity == 0 or epoch == (epochs - 1):
                print("Epoch %d: RMSE: %.6f" % (epoch, rmse))
    else:
        R_hat = np.matmul(weights, components)
        print("R_hat shape: %s" % str(np.shape(R_hat)))
        assert (np.shape(R) == np.shape(R_hat))

        print("PCA single run")
        np.multiply(R_hat, R_mask, out=R_hat_mask)
        # Compute error: MSE = (1/N) * sum((R - R_hat)^2), RMSE = sqrt(MSE)
        diff = np.subtract(R, R_hat_mask)
        diff_square = np.square(diff)
        mse = np.divide(diff_square.sum(), np.count_nonzero(R))
        rmse = np.sqrt(mse)
        print("RMSE: %.5f" % rmse)

    assert (R.shape == R_hat.shape)
    sparse_data = sparse.csr_matrix(R)
    predicted_ranks = metrics.rank_matrix(R_hat)
    precision = metrics.precision_at_k(predicted_ranks, sparse_data, k=k)
    recall = metrics.recall_at_k(predicted_ranks, sparse_data, k=k)
    print("Precision:%.3f%% Recall:%.3f%%" % (precision * 100, recall * 100))

    print("\nStoppping '%s" % sys.argv[0])
Example #10
    def deepfm_test(self):
        train_x, train_y = DeepFM.df2xy(self._ratings)
        #test_x, test_y = DeepFM.df2xy(self.test_data_)

        params = {
            'n_uid': self._ratings.userid.max(),
            'n_mid': self._ratings.itemid.max(),
            # 'n_genre': self.n_genre_,
            'k': self._k,
            'dnn_dim': [64, 64],
            'dnn_dr': 0.5,
            'filepath': '../data/deepfm_weights.h5'
        }

        """ train """
        model = DeepFM(**params)
        train_history = model.fit(train_x,
                                  train_y,
                                  epochs=self._epochs,
                                  batch_size=2048,
                                  validation_split=0.1)

        history = pd.DataFrame(train_history.history)
        history.plot()
        plt.savefig("../data/history.png")

        """ test """
        results = model.evaluate(train_x, train_y)
        print("Validate result:{0}".format(results))

        """ predict """
        y_hat = model.predict(train_x)

        print(np.shape(y_hat))
        # print(np.shape(test_y))

        """ Run Recall and Precision Metrics """
        n_users = np.max(self._ratings.userid.values) + 1
        n_items = np.max(self._ratings.itemid.values) + 1
        print("n_users={0} n_items={1}".format(n_users, n_items))

        # Convert to sparse matrix to run standard metrics
        sparse_train = sparse.coo_matrix((self._ratings.rating.values,
                                          (self._ratings.userid.values, self._ratings.itemid.values)),
                                         shape=(n_users, n_items))

        # sparse_test = sparse.coo_matrix((self.test_data_.rating.values, \
        #                                  (self.test_data_.uid.values, self.test_data_.mid.values)), \
        #                                 shape=(n_users, n_items))
        # pd.DataFrame(data=sparse_test.tocsr().todense().A).to_csv("./testdata.csv")

        # test_predicted
        test_predicted = self._ratings.copy()
        test_predicted.rating = np.round(y_hat)

        sparse_predicted = sparse.coo_matrix((test_predicted.rating.values, \
                                              (test_predicted.userid.values, test_predicted.itemid.values)), \
                                             shape=(n_users, n_items))

        sparse_train_1up = sparse_train.multiply(sparse_train >= 1)
        # sparse_test_1up = sparse_test.multiply(sparse_test >= 1)

        predicted_arr = sparse_predicted.tocsr().todense().A
        predicted_ranks = metrics.rank_matrix(predicted_arr)

        precision_ = metrics.precision_at_k(predicted_ranks, sparse_train, k=self._k)
        recall_ = metrics.recall_at_k(predicted_ranks,  sparse_train, k=self._k)

        print("{0}.xdeepfm_test train precision={1:.4f}% recall={2:.4f}% @k={3}".format(
            __class__.__name__, precision_ * 100, recall_ * 100, self._k))
Example #11
def main():
    print("\nStarting '%s'" % sys.argv[0])

    np.random.seed(8000)

    k = 100

    normalization_enabled = False
    """ Load dataset """
    datafile = "./data/ml-100k/u.data"
    data = pd.read_csv(datafile,
                       sep='\t',
                       names=["userid", "itemid", "rating", "timestamp"])
    """ Convert rating data to user x movie matrix format """
    data = data.sort_values(by=["userid", "itemid"])
    ratings = pd.pivot_table(data,
                             values="rating",
                             index="userid",
                             columns="itemid")
    ratings.fillna(0, inplace=True)

    # train_size = 0.7
    # train_row_size = int(len(ratings.index) * train_size)
    # train_col_size = int(len(ratings.columns) * train_size)
    # ratings = ratings.loc[:train_row_size, :train_col_size]
    users = np.unique(ratings.index.values)
    items = np.unique(ratings.columns.values)
    n_users = len(users)
    n_items = len(items)
    assert (np.max(users) == len(users))
    assert (np.max(items) == len(items))
    print("n_users=%d n_items=%d" % (n_users, n_items))
    """ Take the mean only from non-zero elements """
    temp = ratings.copy()
    rating_mean = temp.copy().replace(0, np.nan).mean().mean()
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean
    print("Rating mean: %.2f" % rating_mean)

    if normalization_enabled:
        temp = ratings.copy()
        ratings_norm = np.subtract(temp, rating_mean, where=temp != 0)
        R = ratings_norm.values
    else:
        R = ratings.values

    U, S, V = linalg.svds(R, k=k)
    # print ("U: ", np.shape(U))
    # print ("S: ", np.shape(S))
    # print ("V: ", np.shape(V))
    sigma = np.diag(S)
    # print ("Sigma: ", np.shape(sigma))
    """ Generate prediction matrix """
    R_hat = np.dot(np.dot(U, sigma), V)
    assert (np.shape(R) == np.shape(R_hat))

    # Get errors only from explicitly rated elements
    R_mask = np.zeros(np.shape(R))
    R_mask[R != 0.000000] = 1
    R_hat_mask = np.zeros(np.shape(R))
    np.multiply(R_hat, R_mask, out=R_hat_mask)

    # Compute error: MSE = (1/N) * sum((R - R_hat)^2), RMSE = sqrt(MSE)
    assert (np.count_nonzero(R) == np.count_nonzero(R_hat_mask))
    diff = np.subtract(R, R_hat_mask)
    diff_square = np.square(diff)
    #mse = np.divide(diff_square.sum(), n_users*n_items)
    mse = np.divide(diff_square.sum(), np.count_nonzero(R_mask))
    rmse = np.sqrt(mse)
    print("RMSE: %.6f" % (rmse))

    assert (R.shape == R_hat.shape)
    interactions = sparse.csr_matrix(R)
    predicted_ranks = metrics.rank_matrix(R_hat)
    precision = metrics.precision_at_k(predicted_ranks, interactions, k=k)
    recall = metrics.recall_at_k(predicted_ranks, interactions, k=k)
    print("Precision:%.3f%% Recall:%.3f%%" % (precision * 100, recall * 100))

    print("\nStopping '%s'" % sys.argv[0])
Example #12
def topk_eval(sess,
              args,
              data_generator,
              model,
              user_list,
              train_record,
              eval_record,
              test_record,
              item_set,
              k_list,
              batch_size,
              mode='test'):

    precision_list = {k: [] for k in k_list}
    recall_list = {k: [] for k in k_list}
    MAP_list = {k: [] for k in k_list}
    hit_ratio_list = {k: [] for k in k_list}
    ndcg_list = {k: [] for k in k_list}

    for user in user_list:
        if mode == 'eval': ref_user = eval_record
        else: ref_user = test_record
        if user in ref_user:
            test_item_list = list(item_set - train_record[user])
            item_score_map = dict()
            start = 0

            while start + batch_size <= len(test_item_list):

                user_list_tmp = [user] * batch_size
                user_list_tmp = np.array(user_list_tmp)

                item_list = test_item_list[start:start + batch_size]
                item_list = np.array(item_list)

                labels_list = [1] * batch_size
                labels_list = np.array(labels_list)

                data = np.concatenate((np.expand_dims(
                    user_list_tmp, axis=1), np.expand_dims(item_list, axis=1),
                                       np.expand_dims(labels_list, axis=1)),
                                      axis=1)

                batch_data = data_generator.generate_rating_batch(
                    data, 0, args.batch_size)
                feed_dict = data_generator.generate_feed_rating_dict(
                    model, batch_data)
                items, scores = model.get_scores(sess, feed_dict)

                for item, score in zip(items, scores):
                    item_score_map[item] = score
                start += batch_size

            # padding the last incomplete minibatch if exists
            if start < len(test_item_list):

                user_list_tmp = [user] * batch_size
                user_list_tmp = np.array(user_list_tmp)
                item_list = test_item_list[start:] + [test_item_list[-1]] * (
                    batch_size - len(test_item_list) + start)
                item_list = np.array(item_list)
                labels_list = [1] * batch_size
                labels_list = np.array(labels_list)
                data = np.concatenate((np.expand_dims(
                    user_list_tmp, axis=1), np.expand_dims(item_list, axis=1),
                                       np.expand_dims(labels_list, axis=1)),
                                      axis=1)

                batch_data = data_generator.generate_rating_batch(
                    data, 0, args.batch_size)
                feed_dict = data_generator.generate_feed_rating_dict(
                    model, batch_data)
                items, scores = model.get_scores(sess, feed_dict)

                for item, score in zip(items, scores):
                    item_score_map[item] = score

            item_score_pair_sorted = sorted(item_score_map.items(),
                                            key=lambda x: x[1],
                                            reverse=True)
            item_sorted = [i[0] for i in item_score_pair_sorted]

            for k in k_list:
                precision_list[k].append(
                    precision_at_k(item_sorted, ref_user[user], k))
                recall_list[k].append(
                    recall_at_k(item_sorted, ref_user[user], k))

            # ndcg: build the 0/1 hit list up to the largest cutoff (not the k
            # left over from the loop above), then score each k against it
            r_hit = []
            for i in item_sorted[:max(k_list)]:
                if i in ref_user[user]:
                    r_hit.append(1)
                else:
                    r_hit.append(0)
            for k in k_list:
                ndcg_list[k].append(ndcg_at_k(r_hit, k))

    precision = [np.mean(precision_list[k]) for k in k_list]
    recall = [np.mean(recall_list[k]) for k in k_list]
    ndcg = [np.mean(ndcg_list[k]) for k in k_list]

    return precision, recall, ndcg, None, None
Example #13
def main():
    session = tf.Session()
    normalized_on = False
    k = 100
    """ load dataset """
    datafile = "./data/ml-100k/u.data"
    df = pd.read_csv(datafile,
                     sep='\t',
                     names=["userid", "itemid", "rating", "timestamp"])
    n_users = len(np.unique(df.userid))
    n_items = len(np.unique(df.itemid))
    rating_mean = np.mean(df.rating)
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean
    print("Raw data:")
    print("Shape: %s" % str(df.shape))
    print("Userid size: %d" % n_users)
    print("Itemid size: %d" % n_items)
    print("Rating mean: %.5f" % rating_mean)
    """ Format ratings to user x item matrix """
    df = df.sort_values(by=["userid", "itemid"])
    ratings = pd.pivot_table(df,
                             values="rating",
                             index="userid",
                             columns="itemid")
    ratings.fillna(0, inplace=True)
    print("Raw ratings size", len(ratings))
    ratings = ratings.astype(np.float64)
    """ Construct training data """
    # train_size = 0.7
    ratings_train_ = ratings  #.loc[:int(n_users*train_size), :int(n_items*train_size)]
    users = ratings_train_.index.values
    items = ratings_train_.columns.values
    n_users = len(users)
    n_items = len(items)
    temp = ratings_train_.copy()
    rating_mean = temp.replace(0, np.nan).mean().mean()
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean

    print("Training data:")
    print("Shape: %s" % str(ratings_train_.shape))
    print("n_users: %d" % n_users)
    print("n_items: %d" % n_items)
    print("rating mean: %.5f" % rating_mean)

    user_indices = [x for x in range(n_users)]
    item_indices = [x for x in range(n_items)]

    print("Max userid train: ", np.max(users))
    print("Max itemid train", np.max(items))
    print("user_indices size ", len(user_indices))
    print("item_indices size ", len(item_indices))

    if normalized_on:
        ratings_norm = np.zeros(ratings_train_.shape)
        temp = ratings_train_.values
        np.subtract(temp, rating_mean, where=temp != 0, out=ratings_norm)
        ratings = ratings_norm
    else:
        ratings = ratings_train_.values

    # Variables
    n_features = 10  # latent factors
    U = tf.Variable(initial_value=tf.truncated_normal([n_users, n_features]))
    P = tf.Variable(initial_value=tf.truncated_normal([n_features, n_items]))

    result = tf.matmul(U, P)

    result_flatten = tf.reshape(result, [-1])
    assert (result_flatten.shape[0] == n_users * n_items)

    # gather one flat index per (user, item) pair in row-major order
    gather_indices = [u * n_items + i for u in user_indices for i in item_indices]
    R = tf.gather(result_flatten, gather_indices)
    assert (R.shape[0] == n_users * n_items)

    R_ = tf.reshape(R, [n_users, n_items])
    assert (R_.shape == ratings.shape)
    """ Compute error for values from the original ratings matrix 
        so that means excluding values implicitly computed by UxP """
    var = tf.Variable(ratings.astype(np.float32))
    compare = tf.not_equal(var, tf.constant(0.0))
    compare_op = var.assign(tf.where(compare, tf.ones_like(var), var))
    R_masked = tf.multiply(R_, compare_op)
    assert (ratings.shape == R_masked.shape)
    """ Cost function: sum_ij{ |r_ij- rhat_ij| + lambda*(|u_i|+|p_j|)} """
    diff_op = tf.subtract(ratings.astype(np.float32), R_masked)
    diff_op_abs = tf.abs(diff_op)
    base_cost = tf.reduce_sum(diff_op_abs)

    # Regularizer sum_ij{lambda*(|U_i| + |P_j|)}
    lambda_ = tf.constant(.001)
    norm_sums = tf.add(tf.reduce_sum(tf.abs(U)), tf.reduce_sum(tf.abs(P)))
    regularizer = tf.multiply(norm_sums, lambda_)
    cost = tf.add(base_cost, regularizer)
    """ Optimizer """
    lr = tf.constant(.0001)
    global_step = tf.Variable(0, trainable=False)
    decaying_learning_rate = tf.train.exponential_decay(lr,
                                                        global_step,
                                                        10000,
                                                        .96,
                                                        staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(
        decaying_learning_rate).minimize(cost, global_step=global_step)
    """ Run """
    init = tf.global_variables_initializer()
    session.run(init)

    print("Running stochastic gradient descent..")
    epoch = 500
    for i in range(epoch):
        session.run(optimizer)
        if i % 10 == 0 or i == epoch - 1:
            # error only over the explicitly rated cells (zeros are masked out)
            diff_op_train = tf.subtract(ratings.astype(np.float32), R_masked)
            diff_op_train_squared = tf.square(diff_op_train)
            se = tf.reduce_sum(diff_op_train_squared)
            mse = tf.divide(se, int(np.count_nonzero(ratings)))
            rmse = tf.sqrt(mse)
            print("Train iter: %d RMSE: %.5f loss: %.5f" %
                  (i, session.run(rmse), session.run(cost)))

    R_hat = R_.eval(session=session)
    predicted_ranks = metrics.rank_matrix(R_hat)
    interactions = sparse.csr_matrix(ratings)
    precision = metrics.precision_at_k(predicted_ranks, interactions, k=k)
    recall = metrics.recall_at_k(predicted_ranks, interactions, k=k)

    print("Precision:%.3f%% Recall:%.3f%%" % (precision * 100, recall * 100))
Example #14
def main():
    logging.info('reading data')

    item_mat = data.get_mult()

    trainM = sparse.csr_matrix(
        data.read_user(f_in='data/dummy/cf-train-10-users.dat',
                       num_u=50,
                       num_v=1929))
    testM = sparse.csr_matrix(
        data.read_user(f_in='data/dummy/cf-test-10-users.dat',
                       num_u=50,
                       num_v=1929))

    trainList = list()
    for user in range(trainM.shape[0]):
        negative = 0
        for item in range(trainM.shape[1]):
            if trainM[user, item] == 1:
                trainList.append([user, item, 1])
            else:
                if negative < 20:
                    trainList.append([user, item, 0])
                    negative += 1
    train = np.array(trainList).astype('float32')

    testList = list()
    for user in range(testM.shape[0]):
        negative = 0
        for item in range(testM.shape[1]):
            if testM[user, item] == 1:
                testList.append([user, item, 1])
    #        else:
    #            if negative < 10:
    #                testList.append( [user, item, 0] )
    #                negative+=1
    test = np.array(testList).astype('float32')

    num_item_feat = item_mat.shape[1]

    model = CollaborativeDeepLearning(item_mat, [num_item_feat, 50, 10])
    model.pretrain(lamda_w=0.001, encoder_noise=0.3, epochs=10)
    model_history = model.fineture(train,
                                   test,
                                   lamda_u=0.01,
                                   lamda_v=0.1,
                                   lamda_n=0.1,
                                   lr=0.01,
                                   epochs=500)
    testing_rmse = model.getRMSE(test)
    print('Testing RMSE = {}'.format(testing_rmse))

    import metrics
    print('AUC %s' % metrics.full_auc(model.cdl_model, testM))

    import matplotlib.pyplot as plt
    M_values = [50, 100, 150, 200, 250, 300]
    recallArray = np.zeros(len(M_values))
    x = 0
    for n in M_values:
        test_recall = metrics.recall_at_k(model.cdl_model, testM, k=n)
        recallArray[x] = test_recall
        print('Recall: %.2f.' % (test_recall))
        x += 1
    plt.plot(M_values, recallArray)
    plt.ylabel("Recall")
    plt.xlabel("M")
    plt.title("Proposed: Recall@M")
    plt.show()
Example #15
def main():
    print("\nStarting '%s'" % sys.argv[0])

    session = tf.Session()

    normalized_on = True

    """ load dataset """
    datafile = "./data/ml-100k/u.data"
    df = pd.read_csv(datafile, sep='\t', names=["userid", "itemid", "rating", "timestamp"])
    n_users = len(np.unique(df.userid))
    n_items = len(np.unique(df.itemid))
    rating_mean = np.mean(df.rating)
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean

    print ("Raw data:")
    print ("Shape: %s" % str(df.shape))
    print ("Userid size: %d" % n_users)
    print ("Itemid size: %d" % n_items)
    print ("Rating mean: %.5f" % rating_mean)

    """ Format ratings to user x item matrix """
    df = df.sort_values(by=["userid", "itemid"])
    ratings = pd.pivot_table(df, values="rating", index="userid", columns="itemid")
    ratings.fillna(0, inplace=True)
    print("Raw ratings size", len(ratings))
    ratings = ratings.astype(np.float64)

    """ Construct training data """
    # train_factor = 0.7
    # train_size = int(n_users*train_factor)
    # ratings_train_ = ratings.loc[:train_size, :int(n_items*train_factor)]
    users = ratings.index.values
    items = ratings.columns.values
    n_users = len(users)
    n_items = len(items)
    temp = ratings.copy()
    rating_mean = temp.replace(0, np.nan).mean().mean()
    rating_mean = 3.5 if rating_mean > 3.5 else rating_mean

    print ("Training data:")
    print ("Shape: %s" % str(ratings.shape))
    print ("n_users: %d" % n_users)
    print ("n_items: %d" % n_items)
    print ("rating mean: %.5f" % rating_mean)

    user_indices = [x for x in range(n_users)]
    item_indices = [x for x in range(n_items)]

    print ("Max userid train: %d" % np.max(users))
    print ("Max itemid train: %d" % np.max(items))
    print ("user_indices size: %d" % len(user_indices))
    print ("item_indices size: %d " % len(item_indices))

    if normalized_on:
        ratings_norm = np.zeros(ratings.shape)
        temp = ratings.values
        np.subtract(temp, rating_mean, where=temp!=0, out=ratings_norm)
        ratings = ratings_norm
    else:
        ratings = ratings.values

    # Variables
    n_features = 10 # latent factors
    U = tf.Variable(initial_value=tf.truncated_normal([n_users, n_features]))
    P = tf.Variable(initial_value=tf.truncated_normal([n_features, n_items]))

    result = tf.matmul(U, P)

    result_flatten = tf.reshape(result, [-1])
    assert (result_flatten.shape[0] == n_users * n_items)

    print ("user indices size: %d item indices size: %d" % (len(user_indices), len(item_indices)))

    # Fill R from result_flatten: one flat index per (user, item) pair in
    # row-major order
    gather_indices = [u * n_items + i for u in user_indices for i in item_indices]
    R = tf.gather(result_flatten, gather_indices)
    assert (R.shape == result_flatten.shape)

    # Format R to user x item sized matrix
    R_ = tf.reshape(R, [n_users, n_items])
    assert (R_.shape == ratings.shape)

    """ Compute error of fields from the original ratings matrix """
    var = tf.Variable(ratings.astype(np.float32))
    compare = tf.not_equal(var, tf.constant(0.0))
    compare_op = var.assign(tf.where(compare, tf.ones_like(var), var))
    R_mask = tf.multiply(R_, compare_op)
    assert (R_mask.shape == np.shape(ratings))

    """ Cost function: sum_ij{ |r_ij- rhat_ij| + lambda*(|u_i|+|p_j|)} """
    # cost |r - r_hat|
    diff_op = tf.subtract(ratings.astype(np.float32), R_mask)
    diff_op_abs = tf.abs(diff_op)
    base_cost = tf.reduce_sum(diff_op_abs)

    lambda_ = tf.constant(.001)
    norm_sums = tf.add(tf.reduce_sum(tf.abs(U)), tf.reduce_sum(tf.abs(P)))
    regularizer = tf.multiply(norm_sums, lambda_)
    cost = tf.add(base_cost, regularizer)

    """ Run """
    init = tf.global_variables_initializer()
    session.run(init)
    session.run(cost)

    """ Mean square error """
    diff_op_train = tf.subtract(ratings.astype(np.float32), R_mask)
    diff_op_train_squared = tf.square(diff_op_train)
    diff_op = tf.sqrt(tf.reduce_sum(diff_op_train_squared))
    cost_train = tf.divide(diff_op, ratings.shape[0])
    cost_train_result =  session.run(cost_train)
    print("Training MSE: %.5f" % cost_train_result)

    k = 100
    R_hat = R_.eval(session=session)
    print (ratings[:5, :5])
    print (R_hat[:5,:5])
    interactions = sparse.csr_matrix(ratings)
    predicted_ranks = metrics.rank_matrix(R_hat)
    precision = metrics.precision_at_k(predicted_ranks, interactions, k=k)
    recall = metrics.recall_at_k(predicted_ranks, interactions, k=k)
    print("Precision:%.3f%% Recall:%.3f%%" % (precision * 100, recall * 100))

    print("\nStopping '%s'" % sys.argv[0])