Example No. 1
    def xls_to_csv(self):
        num_user = 0

        x = xlrd.open_workbook('../data/jester-data-1/jester-data-1.xls')
        x1 = x.sheet_by_name('jester-data-1-new')

        list_item_seem_by_user_test = []
        list_item_seem_by_user_train = []
        for rownum in range(x1.nrows):  # iterate over every row (user) in the sheet

            self.check = 0
            for idx, val in enumerate(x1.row_values(rownum)):
                if idx == 0 and val >= 50:
                    break
                # From idx = 1 onward, keep only actual ratings (value != 99)
                if idx != 0 and val != 99 and num_user <= 50:
                    list_item_seem_by_user_train.append([rownum, idx, val])
                    self.check = 1
                elif idx != 0 and val != 99:
                    list_item_seem_by_user_test.append([rownum, idx, val])

            if self.check == 1:
                num_user += 1

        WriteFile(out_test_file, list_item_seem_by_user_test).write()
        WriteFile(out_train_file, list_item_seem_by_user_train).write()
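The method above relies on xlrd for reading the Jester spreadsheet and on CaseRecommender's WriteFile helper for output (out_test_file and out_train_file are attributes defined elsewhere in the class). A minimal sketch of the write pattern it uses, assuming the library's usual module path; the file name and triples below are illustrative only:

from caserec.utils.process_data import WriteFile

# (user, item, rating) triples, mirroring the [rownum, idx, val] lists built above
triples = [(0, 5, 4.25), (0, 7, -2.0), (1, 5, 0.5)]
WriteFile('example_output.dat', triples, sep='\t').write()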
Example No. 2
    def write_files(self, trained_model):
        fold = 0
        for train_index, test_index in trained_model:
            if self.dir_folds is not None:
                train_file = self.dir_folds + str(fold) + '/train.dat'
                test_file = self.dir_folds + str(fold) + '/test.dat'

                df_train = self.df.iloc[train_index]
                df_test = self.df.iloc[test_index]

                WriteFile(train_file, sep=self.sep_write, mode=self.write_mode
                          ).write_with_pandas(df_train.sort_values(by=[0, 1]))
                WriteFile(test_file, sep=self.sep_write, mode=self.write_mode
                          ).write_with_pandas(df_test.sort_values(by=[0, 1]))

                fold += 1
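write_with_pandas is used above to dump each fold's DataFrame. A rough sketch, under the assumption that it is a thin wrapper over DataFrame.to_csv honoring the configured separator and write mode; the class below is illustrative (hence the Sketch suffix), not CaseRecommender's actual implementation:

import pandas as pd

class WriteFileSketch:
    """Illustrative stand-in for the WriteFile helper used above."""

    def __init__(self, output_file, sep='\t', mode='w'):
        self.output_file = output_file
        self.sep = sep
        self.mode = mode

    def write_with_pandas(self, df: pd.DataFrame):
        # Dump without header or index so the .dat file keeps the plain
        # (user, item, value) layout expected by ReadFile.
        df.to_csv(self.output_file, sep=self.sep, mode=self.mode,
                  header=False, index=False)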
    def write_ranking(self):
        """
        Method to write final ranking

        """

        if self.output_file is not None:
            WriteFile(self.output_file, data=self.ranking, sep=self.sep).write()
    def write_predictions(self):
        """
        Method to write final predictions

        """

        if self.output_file is not None:
            WriteFile(self.output_file, data=self.predictions, sep=self.sep).write()
    def recommendation_step(self):
        for user in self.test_set['users']:
            user_id = self.user_to_user_id[user]
            bu, hu = mean_confidence_interval(
                list(self.train_set['feedback'][user].values()), confidence=.95)

            for item in self.test_set['items_seen_by_user'][user]:
                cluster = self.father_of[user_id]
                '''
                mi^k -> mean rating of the item within subset (cluster) k
                mu   -> mean rating of user u
                * use h -> the gap between the mean and the interval bound
                  (only used when climbing up the tree?)

                rui = (wi * mi^k + wu * mu) / (wi + wu)
                '''

                bi = 0
                last_h = float('inf')

                while True:
                    if cluster is None:
                        break

                    if self.cluster_item_interval[cluster].get(item, -1) == -1:
                        cluster = self.father_of[cluster]
                    else:
                        new_h = self.cluster_item_interval[cluster][item][1]

                        if np.isnan(new_h) or new_h == 0:
                            bi = self.cluster_item_interval[cluster][item][0]
                            cluster = self.father_of[cluster]

                        elif new_h < last_h:
                            last_h = new_h
                            bi = self.cluster_item_interval[cluster][item][0]
                            cluster = self.father_of[cluster]

                        else:
                            cluster = self.father_of[cluster]

                if bi == 0:
                    rui = bu
                else:
                    rui = .5 * bu + .5 * bi

                self.predictions.append((user, item, rui))

        self.predictions = sorted(self.predictions, key=lambda x: x[1])

        if self.output_file is not None:
            WriteFile(self.output_file, data=self.predictions,
                      sep=self.sep).write()
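recommendation_step builds each prediction from two interval estimates: mean_confidence_interval returns the user's mean rating bu together with the half-width hu of a 95% confidence interval, and cluster_item_interval appears to store the analogous (mean, h) pair per cluster and item. With equal weights (wi == wu), the commented formula rui = (wi * mi^k + wu * mu) / (wi + wu) reduces to the simple average 0.5 * bu + 0.5 * bi used in the code. Below is a sketch of a standard mean_confidence_interval helper (the usual scipy-based recipe); the project's own implementation may differ:

import numpy as np
import scipy.stats as st

def mean_confidence_interval(data, confidence=0.95):
    # Returns the sample mean and the half-width h of a t-based confidence
    # interval, so the interval is [mean - h, mean + h]. For a single
    # observation st.sem returns NaN, which lines up with the np.isnan
    # checks in the code above.
    a = np.asarray(data, dtype=float)
    n = len(a)
    mean = np.mean(a)
    h = st.sem(a) * st.t.ppf((1 + confidence) / 2.0, n - 1)
    return mean, h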
    def generate_groups(self):
        fold_for_sets = self.dir_name + '/gb_train_' + str(self.parser) + '/'
        if not os.path.exists(fold_for_sets):
            os.mkdir(fold_for_sets)

        train_tuple = self.run_kmedoids()
        self.k_groups = len(train_tuple)
        for f in range(len(train_tuple)):
            train_file_name = fold_for_sets + 'train_%d.dat' % f
            WriteFile(train_file_name, data=train_tuple[f],
                      sep=self.sep).write()
            self.gb_train_files.append(train_file_name)
        del self.train_set_list
Example No. 7
def case_rec_evaluation(sess, args, model, data, ripple_set, batch_size):
    predictions_output_filepath = '../data/' + args.dataset + '/ripplenet_preds.dat'
    test_output_filepath = '../data/' + args.dataset + '/ripplenet_tests.dat'

    i_map = load_dict('../data/' + args.dataset + '/i_map.txt')
    u_map = load_dict('../data/' + args.dataset + '/u_map.txt')

    start = 0
    print_preds = []
    while start < data.shape[0]:
        feed_dict = get_feed_dict(args, model, data, ripple_set, start, start + batch_size)
        labels, scores = sess.run([model.labels, model.scores_normalized], feed_dict)
        print('len_scores:%d\tlen_items:%d' % (len(scores), len(feed_dict[model.items])))
        #for u, u_scores in enumerate(scores):
        #    for i, score in enumerate(u_scores):
        #        print_preds.append((u_map[start+u], i_map[i], score))
        start += batch_size
    WriteFile(predictions_output_filepath, data=print_preds, sep='\t').write()

    print_tests = []
    for u, u_data in enumerate(data):
        for i, score in enumerate(u_data):
            print_tests.append((u_map[start+u], i_map[i], score))
    WriteFile(test_output_filepath, data=print_tests, sep='\t').write()

    # Using CaseRecommender ReadFile class to read test_set from file
    eval_data = ReadFile(input_file=test_output_filepath).read()
    predictions_data = ReadFile(input_file=predictions_output_filepath).read()

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])

    # Getting evaluation
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)
    print('\nItem Recommendation Metrics:\n', item_rec_metrics)

    return item_rec_metrics
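The case_rec_* helpers in these examples all share the same round trip: dump (user, item, score) triples with WriteFile, read both files back with ReadFile, and score the result with ItemRecommendationEvaluation. A self-contained sketch of that pattern with toy data; the import paths follow CaseRecommender's usual layout but are stated here as an assumption, and the file names are illustrative:

from caserec.utils.process_data import ReadFile, WriteFile
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

preds = [(1, 10, 1.0), (1, 20, 0.5), (2, 10, 0.9)]   # (user, item, score)
tests = [(1, 10, 1.0), (2, 30, 1.0)]                 # held-out interactions

WriteFile('toy_preds.dat', data=preds, sep='\t').write()
WriteFile('toy_tests.dat', data=tests, sep='\t').write()

eval_data = ReadFile(input_file='toy_tests.dat').read()
predictions_data = ReadFile(input_file='toy_preds.dat').read()

metrics = ItemRecommendationEvaluation(n_ranks=[10]).evaluate(
    predictions_data['feedback'], eval_data)
print(metrics)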
def case_rec_evaluator(test_file, predictions_file, top_score_dict):
    print_list = []
    for user, item_list in top_score_dict.items():
        for item in item_list:
            score = 1.0 / (item_list.index(item) + 1)
            print_list.append((int(user[1:]), int(item[1:]), float(score)))
    WriteFile(predictions_file, data=print_list, sep='\t').write()

    # Using CaseRecommender ReadFile class to read test_set from file
    eval_data = ReadFile(input_file=test_file).read()
    predictions_data = ReadFile(input_file=predictions_file).read()

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])

    # Getting evaluation
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)

    print('\nItem Recommendation Metrics:\n', item_rec_metrics)

    return item_rec_metrics
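The 1.0 / (item_list.index(item) + 1) scores above exist only to turn an ordered top-N list into monotonically decreasing pseudo-scores the evaluator can rank. An equivalent way to write that loop with enumerate (same output, but without the repeated list.index lookups); the toy dict just mimics the 'u<id>' / 'i<id>' string keys the original code strips:

top_score_dict = {'u1': ['i10', 'i20', 'i30']}   # illustrative keys only

print_list = []
for user, item_list in top_score_dict.items():
    for rank, item in enumerate(item_list, start=1):
        # 1-based rank gives the same 1.0, 0.5, 0.333... pseudo-scores
        print_list.append((int(user[1:]), int(item[1:]), 1.0 / rank))
# -> [(1, 10, 1.0), (1, 20, 0.5), (1, 30, 0.333...)]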
Example No. 9
def case_rec_evaluation(sess, model, users_to_test, Ks, drop_flag=False, batch_test_flag=False):
    batch_test_flag=False
    ### Added:
    preds_output_filepath = '../Data/ml1m-sun2kgat/kgat_pred.txt'
    test_output_filepath = '../Data/ml1m-sun2kgat/case_rec_test.txt'
    ### Added-
    result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)), 'ndcg': np.zeros(len(Ks)),
              'hit_ratio': np.zeros(len(Ks)), 'auc': 0.}

    ### Removed:
    ### pool = multiprocessing.Pool(cores)
    ### Removed-

    if args.model_type in ['ripple']:
        u_batch_size = BATCH_SIZE
        i_batch_size = BATCH_SIZE // 20
    elif args.model_type in ['fm', 'nfm']:
        u_batch_size = BATCH_SIZE
        i_batch_size = BATCH_SIZE
    else:
        u_batch_size = BATCH_SIZE * 2
        i_batch_size = BATCH_SIZE

    test_users = users_to_test
    n_test_users = len(test_users)
    n_user_batchs = n_test_users // u_batch_size + 1

    count = 0

    print_preds = []
    for u_batch_id in range(n_user_batchs):
        start = u_batch_id * u_batch_size
        end = (u_batch_id + 1) * u_batch_size

        user_batch = test_users[start: end]

#        if batch_test_flag:

#            n_item_batchs = ITEM_NUM // i_batch_size + 1
#            rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM))
#
#            i_count = 0
#            for i_batch_id in range(n_item_batchs):
#                i_start = i_batch_id * i_batch_size
#                i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM)
#
#                item_batch = range(i_start, i_end)
#
#                feed_dict = data_generator.generate_test_feed_dict(model=model,
#                                                                   user_batch=user_batch,
#                                                                   item_batch=item_batch,
#                                                                   drop_flag=drop_flag)
#                i_rate_batch = model.eval(sess, feed_dict=feed_dict)
#                i_rate_batch = i_rate_batch.reshape((-1, len(item_batch)))
#
#                rate_batch[:, i_start: i_end] = i_rate_batch
#                i_count += i_rate_batch.shape[1]
#
#            assert i_count == ITEM_NUM

#        else:
        item_batch = range(ITEM_NUM)
        feed_dict = data_generator.generate_test_feed_dict(model=model,
                                                           user_batch=user_batch,
                                                           item_batch=item_batch,
                                                           drop_flag=drop_flag)
        rate_batch = model.eval(sess, feed_dict=feed_dict)
        rate_batch = rate_batch.reshape((-1, len(item_batch)))

        user_batch_rating_uid = zip(rate_batch, user_batch)
        ### Removed: from function: test in utility/batch_test.py
        ## batch_result = pool.map(test_one_user, user_batch_rating_uid)
        ### Removed-
        ### Added: from function test_one_user in utility/batch_test.py:

        for rating, u in user_batch_rating_uid:
            try:
                training_items = data_generator.train_user_dict[u]
            except Exception:
                training_items = []

            all_items = set(range(data_generator.n_items))

            test_items = list(all_items - set(training_items))

            item_score = {}
            for i in test_items:
                item_score[i] = rating[i]

            K_max = max(Ks)
            K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

            for i in K_max_item_score:
                score = item_score[i]
                print_preds.append((u, i, score))

    WriteFile(preds_output_filepath, data=print_preds, sep='\t').write()

        #for rating, u in user_batch_rating_uid:
        #    #user u's items in the test set
        #    user_pos_test = data_generator.test_user_dict[u]
        #    for i in user_pos_test:
        #        print_tests.append((u, i))
        #
        #WriteFile(test_output_filepath, data=print_tests, sep='\t', as_binary=True).write()

        ### Added-
    ### Removed:
    ###    count += len(batch_result)

    ###for re in batch_result: result['precision'] +=
    ###    re['precision']/n_test_users result['recall'] +=
    ###    re['recall']/n_test_users result['ndcg'] += re['ndcg']/n_test_users
    ###    result['hit_ratio'] += re['hit_ratio']/n_test_users result['auc'] +=
    ###    re['auc']/n_test_users

    ###assert count == n_test_users

    ### pool.close()
    ### Removed-

    # Using CaseRecommender ReadFile class to read test_set from file
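    # Note: test_output_filepath is expected to already exist on disk at this
    # point, since the block above that would regenerate it is commented out.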
    eval_data = ReadFile(input_file=test_output_filepath, as_binary=True).read()
    predictions_data = ReadFile(input_file=preds_output_filepath).read()

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])

    # Getting evaluation
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)
    print('\nItem Recommendation Metrics:\n', item_rec_metrics)

    return item_rec_metrics
def case_rec_evaluateRec(FLAGS,
                         model,
                         eval_iter,
                         eval_dict,
                         all_dicts,
                         i_map,
                         logger,
                         i,
                         eval_descending=True,
                         is_report=False):
    # Evaluate
    total_batches = len(eval_iter)
    # processing bar
    pbar = tqdm(total=total_batches)
    pbar.set_description("Run Eval")

    all_i_var = None
    if FLAGS.share_embeddings:
        all_i_ids = [i_map[i] for i in range(len(i_map))]
        all_i_var = to_gpu(V(torch.LongTensor(all_i_ids)))

    model.eval()
    model.disable_grad()

    results = []
    for u_ids in eval_iter:
        u_var = to_gpu(V(torch.LongTensor(u_ids)))
        # batch * item
        scores = model.evaluateRec(u_var, all_i_ids=all_i_var)
        preds = zip(u_ids, scores.data.cpu().numpy())

        results.extend(
            evalRecProcess(list(preds),
                           eval_dict,
                           all_dicts=all_dicts,
                           descending=eval_descending,
                           num_processes=FLAGS.num_processes,
                           topn=FLAGS.topn,
                           queue_limit=FLAGS.max_queue))

        pbar.update(1)
    pbar.close()

    # predictions: [(pred[0], top_ids, gold), ...], where gold holds the test items
    predictions = [result[5] for result in results]
    print("Saving predictions. Size: {}.".format(str(len(predictions))))

    predictions_output_filepath = os.path.join(
        FLAGS.log_path, FLAGS.experiment_name + '_pred.dat')
    print_list = []
    for triple in predictions:
        u_id = triple[0]
        top_ids = triple[1]
        #gold = triple[2]
        for i_id in top_ids:
            score = 1.0 / (top_ids.index(i_id) + 1)
            print_list.append((u_id, i_id, score))
    WriteFile(predictions_output_filepath, data=print_list, sep='\t').write()

    # Using CaseRecommender ReadFile class to read test_set from file
    dataset_path = os.path.join(FLAGS.data_path, FLAGS.dataset)
    eval_files = FLAGS.rec_test_files.split(':')
    test_path = os.path.join(dataset_path, eval_files[i])
    eval_data = ReadFile(input_file=test_path).read()
    predictions_data = ReadFile(input_file=predictions_output_filepath).read()
    print("Reading predictions. Size: {}.".format(
        str(len(predictions_data['feedback']))))

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'],
                                          eval_data)
    print("From CaseRecommender evaluator: {}.".format(str(item_rec_metrics)))
    logger.info("From CaseRecommender evaluator: {}.".format(
        str(item_rec_metrics)))

    # Creating kg-summ-rec evaluator with diversity parameters
    dataset_name = os.path.basename(
        os.path.dirname(os.path.dirname(FLAGS.log_path)))
    tags = dataset_name.split('_')
    if tags[0] == 'ml-sun':
        evaluator2 = DiversityEvaluation(n_ranks=[10])
        dataset_path = os.path.normpath(FLAGS.data_path + os.sep + os.pardir)
        #tags = dataset_name.split('-')
        #if len(tags) > 2:
        #    mode = dataset_name.split('-')[2]
        #    ratio = dataset_name.split('-')[4]
        #else:
        #    mode = 'sv'
        #    ratio = '100'
        dataset_path = os.path.normpath(FLAGS.data_path + os.sep + os.pardir +
                                        os.sep + os.pardir + os.sep + tags[0] +
                                        '_' + tags[1] + '_' + 'oKG')
        mode = 'sv'
        ratio = '100'
        i2genre_map = read_i2genre_map(dataset_path, mode, ratio)
        diversity_metrics = evaluator2.evaluate(predictions_data['feedback'],
                                                eval_data, i2genre_map)
        print("From kg-summ-rec diversity evaluator: {}.".format(
            str(diversity_metrics)))
        logger.info("From kg-summ-rec diversity evaluator: {}.".format(
            str(diversity_metrics)))

    model.enable_grad()
    return item_rec_metrics
Example No. 11
    def export_data(self):
        self.processData()
        WriteFile(self.outTestFile, self.test_data).write()
        WriteFile(self.outTrainFile, self.train_data).write()