Exemplo n.º 1
0
    def valid(self, model):
        """Score ``model`` on the invited-info training set and print NDCG.

        Loads ``invited_info_train.pkl``, ``question_info.pkl`` and
        ``user_info.pkl`` through ``self.load``, predicts a score for every
        (question, user) row and prints the mean over questions of
        ``0.5 * NDCG@5 + 0.5 * NDCG@10``.

        Args:
            model: object exposing ``prediction_model.predict``; it is called
                with ``[padded_questions, padded_answers]``.

        Returns:
            The ``groupby('question_id')`` object built over the training
            frame after a ``predict`` column has been added to it.
        """
        # Imported once up front instead of on every loop iteration.
        from ndcg import ndcg_at_k

        batch_size = self.params.get('batch_size', 128)

        invited_info_train = self.load('invited_info_train.pkl')
        question_info = self.load('question_info.pkl')
        user_info = self.load('user_info.pkl')

        question_words_seq = [
            list(question_info['words_seq'][qid])
            for qid in invited_info_train['question_id']
        ]
        answers_words_seq = [
            list(user_info['user_desc_words_sec'][uid])
            for uid in invited_info_train['user_id']
        ]

        question_words_seq = self.padq(question_words_seq)
        answers_words_seq = self.pada(answers_words_seq)

        predict = model.prediction_model.predict(
            [question_words_seq, answers_words_seq],
            batch_size=batch_size,
            verbose=1)

        # Each prediction entry is indexed as [0][0] to extract its score.
        invited_info_train['predict'] = [row[0][0] for row in predict]
        train_group = invited_info_train.groupby('question_id')

        scores = []
        for _, rows in train_group:
            ranked = rows.sort_values('predict', ascending=False)
            # The relevance list handed to NDCG is predicted score times
            # answer_flag, in descending predicted-score order.
            gains = [
                ranked['predict'][idx] * ranked['answer_flag'][idx]
                for idx in ranked.index
            ]
            scores.append(
                ndcg_at_k(gains, 5) * 0.5 + ndcg_at_k(gains, 10) * 0.5)

        print('ndcg mean is %lf' % np.mean(scores))

        return train_group
Exemplo n.º 2
0
def handle_question(question_queue, invdata, new_valdata, results_queue):
    """Worker loop: score queued question ids until a ``None`` sentinel.

    For every question id taken from ``question_queue`` the users listed for
    that question in ``new_valdata`` are ordered by descending ``label``; the
    third column of the matching ``invdata`` row is collected for each user
    in that order, and ``0.5 * NDCG@5 + 0.5 * NDCG@10`` of that list is put
    on ``results_queue`` as ``{"qid": ..., "val": ...}``.

    Args:
        question_queue: queue of question ids; ``None`` stops the worker.
        invdata: DataFrame with ``qid`` and ``uid`` columns; the value read
            is the one at column position 2 of the first matching row.
        new_valdata: DataFrame with ``qid``, ``uid`` and ``label`` columns.
        results_queue: queue receiving one result dict per question.

    Raises:
        IndexError: if a (qid, uid) pair from ``new_valdata`` has no row in
            ``invdata``.
    """
    while True:
        qid = question_queue.get()
        if qid is None:
            break

        q_result = new_valdata[new_valdata.qid == qid]
        sorted_q_result = q_result.sort_values(
            ['label'], axis=0, ascending=False)
        sorted_users = sorted_q_result['uid']

        # The qid filter is identical for every user of this question, so it
        # is applied once here rather than once per user over all of invdata.
        inv_for_question = invdata[invdata.qid == qid]
        r = [
            inv_for_question[inv_for_question.uid == uid].values[0][2]
            for uid in sorted_users
        ]
        results_queue.put({
            "qid": qid,
            "val": (ndcg.ndcg_at_k(r, 5) * 0.5) + (ndcg.ndcg_at_k(r, 10) * 0.5),
        })
Exemplo n.º 3
0
 def ndcgerror(self, preds, dtrain):
     """Evaluation metric: mean blended NDCG over the groups of ``dtrain``.

     ``dtrain.get_group()`` supplies consecutive group sizes; ``preds`` and
     the labels from ``dtrain.get_label()`` are sliced group by group. Inside
     a group the labels are ordered by descending prediction and scored as
     ``0.5 * NDCG@5 + 0.5 * NDCG@10``.

     Returns:
         Tuple ``('ndcg_error', mean_score)``.
     """
     assert isinstance(dtrain, XGMatrix)
     labels = dtrain.get_label()
     group = dtrain.get_group()

     start = 0
     per_group = []
     for size in group:
         stop = start + size
         group_preds = preds[start:stop]
         group_labels = labels[start:stop]
         start = stop

         # Positions within the group, highest prediction first.
         order = sorted(range(size),
                        key=lambda pos: group_preds[pos],
                        reverse=True)
         ranked_labels = [group_labels[pos] for pos in order]

         blended = (ndcg_at_k(ranked_labels, 5) * 0.5
                    + ndcg_at_k(ranked_labels, 10) * 0.5)
         per_group.append(blended)
     return 'ndcg_error', np.mean(per_group)
Exemplo n.º 4
0
def ndcg(valfile):
    """Return the size-weighted mean of blended NDCG over all questions.

    ``valfile`` is read as comma-separated ``qid,uid,prob`` lines; its first
    line is read and discarded (presumably a header). For each question the
    users are ordered by descending probability, mapped to ground truth via
    ``getTrueVal()[(qid, uid)]`` and scored as
    ``0.5 * NDCG@5 + 0.5 * NDCG@10``. Each question's score is weighted by
    its number of candidates.
    """
    trueVal = getTrueVal()

    candidates_by_question = {}
    with open(valfile, 'r') as handle:
        handle.readline()  # first line is not parsed
        for row in handle:
            qid, uid, prob = row.rstrip('\n').split(',')
            candidates_by_question.setdefault(qid, []).append(
                (uid, float(prob)))

    scores = []
    weights = []
    for qid, candidates in candidates_by_question.items():
        ranked = sorted(candidates, key=itemgetter(1), reverse=True)
        relevance = [trueVal[(qid, entry[0])] for entry in ranked]
        scores.append(
            ndcg_at_k(relevance, 5) * 0.5 + ndcg_at_k(relevance, 10) * 0.5)
        weights.append(len(relevance))

    return np.average(scores, weights=weights)
Exemplo n.º 5
0
def ndcg4dataset(mapping, ques2user, k):
    """Average NDCG@k over the questions that have at least ``k`` candidates.

    Args:
        mapping: dict keyed by ``(question, user)`` giving the ground-truth
            value passed to ``ndcg_at_k``.
        ques2user: dict mapping each question to a list of ``(user, score)``
            pairs. Every list is sorted in place by descending score.
        k: rank cutoff; questions with fewer than ``k`` candidates are
            left out of the average.

    Returns:
        The mean of ``ndcg_at_k(..., k, method=1)`` over the evaluated
        questions, or ``0.0`` when no question qualifies (including an
        empty ``ques2user``).
    """
    # Sort every candidate list first; scoring happens once afterwards
    # instead of being recomputed for all questions on each iteration.
    for candidates in ques2user.values():
        candidates.sort(key=lambda pair: pair[1], reverse=True)

    scores = [
        ndcg_at_k([mapping[(ques, user)] for user, _ in candidates],
                  k, method=1)
        for ques, candidates in ques2user.items()
        if k <= len(candidates)
    ]
    if not scores:
        return 0.0

    return sum(scores) / len(scores)
Exemplo n.º 6
0
# Fit an 11-neighbour index on y and print, for each cutoff 1..9, the mean
# over all samples of ndcg_at_k applied to the vector from calcAccuracy.
knn = NearestNeighbors(n_neighbors=11)
knn.fit(y)

# The neighbour query does not depend on the cutoff, so it is issued once per
# sample here instead of once per (cutoff, sample) pair.
# NOTE(review): assumes calcAccuracy does not modify `neighbours` in place.
neighbour_lists = [
    knn.kneighbors(np.array(y[i]).reshape(1, -1), return_distance=False)
    for i in range(0, len(y))
]

for j in range(2, 11):
    avg_n = 0.0
    for i, neighbours in enumerate(neighbour_lists):
        _, vec = calcAccuracy(neighbours, i)
        avg_n = avg_n + ndcg_at_k(vec, j - 1)
    avg_n = avg_n / len(y)
    print(avg_n)
'''
# compute final accuracy on training and test sets
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(tr_y, y_pred)
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(te_y, y_pred)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
'''
Exemplo n.º 7
0
'''
Evaluating ndcg score on the predicted results
Note: (i) ndcg.py is assumed to be in ./bytecup2016data and
(ii) change path to point to the csv file that has to be evaluated
'''
import sys
sys.path.insert(0, './bytecup2016data/')
import ndcg as ndcg
path = "f3.csv"
finalcsv = pa.read_csv(path, sep=",", names=["qid", "uid", "label"])
# Users of each question, ordered by descending predicted label.
mydict = (finalcsv.sort_values('label', ascending=False)
          .groupby('qid')['uid'].apply(list).to_dict())
ndcg_5 = 0
ndcg_10 = 0
count = 0
for key in mydict:
    user_list = mydict[key]
    # Rows of this question are selected once; the per-user lookup below then
    # only scans those rows instead of the whole training frame.
    question_rows = training_data_formatted.loc[
        training_data_formatted['qid'] == key]
    ranking = []
    for user in user_list:
        value = question_rows.loc[question_rows['uid'] == user]
        # First matching row supplies the ground-truth label.
        ranking.append(value['label'].values[0])
    ndcg_5 += ndcg.ndcg_at_k(ranking, 5)
    ndcg_10 += ndcg.ndcg_at_k(ranking, 10)
    count += 1
ndcg_5 /= count
ndcg_10 /= count
final_score = (ndcg_5 + ndcg_10) * 0.5
# Call form prints the same single value on Python 2 and parses on Python 3.
print(final_score)
    [3, 2, 2, 1, 2, 1, 1, 3, 1, 2],  #item 1 ranking in rank order
    [2, 0, 1, 1, 3, 3, 1, 1, 2, 1],  #item 2 ranking in rank order,  etc...
    [2, 2, 2, 1, 2, 3, 2, 1, 2, 1],
    [3, 3, 2, 2, 1, 1, 1, 2, 1, 3],
    [2, 2, 2, 1, 1, 1, 1, 1, 1, 1],
    [3, 3, 3, 3, 3, 3, 2, 1, 1, 1],
    [3, 3, 3, 3, 3, 3, 3, 3, 2, 2],
    [3, 3, 2, 2, 2, 2, 1, 1, 1, 0],
    [2, 3, 2, 2, 2, 3, 2, 0, 1, 1],
    [3, 3, 1, 0, 0, 0, 1, 0, 0, 0],
    [3, 3, 3, 3, 1, 1, 1, 1, 1, 1],
    [2, 1, 2, 2, 1, 1, 1, 0, 0, 0],
    [3, 3, 3, 3, 2, 1, 3, 2, 2, 2],
    [3, 2, 3, 2, 2, 1, 2, 2, 2, 2],
    [3, 3, 3, 1, 3, 2, 3, 2, 1, 1]
]

# NDCG@k (method=1) of every ranking in all_r.
all_ndcg = [ndcg_at_k(ranking, k, method=1) for ranking in all_r]

avg_ndcg = np.sum(all_ndcg) / len(all_ndcg)
stdev_ndcg = np.std(all_ndcg)
# The standard error divides by sqrt(50), a fixed count, not len(all_ndcg).
std_err_ndcg = stdev_ndcg / np.sqrt(50)

for reported in (avg_ndcg, std_err_ndcg, all_ndcg):
    print(reported)
Exemplo n.º 9
0
def test_ranking(model,
                 testing_path,
                 code_list,
                 image_features,
                 brands,
                 gpu=None):
    """Rank the test posts for each brand and return aggregate metrics.

    Brand aspect embeddings (averaged over aspects) and truncated-model post
    features are compared with cosine similarity; each row of the resulting
    score matrix is turned into a ranking of all test posts.

    Args:
        model: trained model providing ``brand_embeddings``,
            ``aspects_embeddings``, ``fc1`` and ``fc2``.
        testing_path: CSV path; column 0 is read as the brand and column 1
            as the post.
        code_list: mapping from post to its row in ``image_features``.
        image_features: numpy array of image features, indexed by row.
        brands: DataFrame with ``username`` and ``vertical`` columns.
        gpu: CUDA device id, or a falsy value to stay on the CPU.

    Returns:
        Tuple ``(MedR, AUC, cAUC, NDCG@10, NDCG@50)``: the median rank of the
        first positive post and the averages of the other four per-brand
        metrics.

    Raises:
        ZeroDivisionError: if a brand has no positive, no negative or no
            same-vertical competitor posts.
    """
    brand_list = brands['username'].tolist()
    data = pd.read_csv(testing_path).values
    test_brands = unique(data[:, 0])
    test_posts = unique(data[:, 1])
    # post -> brand, taken from columns 1 and 0 of the CSV
    brand_dict = {d[1]: d[0] for d in data}

    asp_model = VggModelAspects(model.brand_embeddings,
                                model.aspects_embeddings).eval()
    model_truncated = VggModelTruncated(model.fc1, model.fc2).eval()

    brand_ids = Variable(torch.LongTensor(
        [brand_list.index(el) for el in test_brands]),
                         volatile=True)
    image_features = Variable(torch.from_numpy(
        image_features[[code_list[el] for el in test_posts], :]),
                              volatile=True)

    # NOTE(review): `if gpu:` also skips device id 0 - confirm that is meant.
    if gpu:
        asp_model.cuda(gpu)
        model_truncated.cuda(gpu)
        # .cuda() on a tensor returns a copy, so the result must be rebound.
        brand_ids = brand_ids.cuda(gpu)
        image_features = image_features.cuda(gpu)

    # Computing Aspect Features
    aspects = asp_model(brand_ids)
    aspects = aspects.permute((1, 0, 2)).mean(0)

    # Computing Post Features
    posts = model_truncated(image_features)

    aspects = aspects.data
    posts = posts.data
    if gpu:
        # .cpu() also returns a copy; rebind before converting to numpy.
        aspects = aspects.cpu()
        posts = posts.cpu()
    aspects = aspects.numpy()
    posts = posts.numpy()

    # Computing similarity scores
    scores = cosine_similarity(aspects, posts)

    queries = []
    pbar = tqdm(total=len(scores))

    verticals = {
        b['username']: b['vertical']
        for index, b in brands.iterrows()
    }
    for p in range(scores.shape[0]):
        # NOTE(review): score rows follow test_brands (see brand_ids), while
        # the row is matched against brand_list[p]; these agree only if both
        # lists share the same order - confirm with the caller.
        brand = brand_list[p]

        # Computing evaluation metrics for a brand
        predictions = [(test_posts[j], scores[p, j], brand_dict[test_posts[j]])
                       for j in range(scores.shape[1])]
        s_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)
        ranked_brands = [v[-1] for v in s_predictions]

        pos = [v[1] for v in s_predictions if brand == v[-1]]
        neg = [v[1] for v in s_predictions if brand != v[-1]]
        # Competitors: posts of other brands in the same vertical.
        comp = [
            v[1] for v in s_predictions
            if brand != v[-1] and verticals[brand] == verticals[v[-1]]
        ]

        # Number of (positive, negative) pairs ranked in the right order.
        ordered_vs_neg = np.sum(
            [len([el for el in neg if e > el]) for e in pos])
        ordered_vs_comp = np.sum(
            [len([el for el in comp if e > el]) for e in pos])
        rank_of_first_pos = ranked_brands.index(brand)
        relevance = [1 if brand == b else 0 for b in ranked_brands]

        queries.append(
            (rank_of_first_pos,
             float(ordered_vs_neg) / (len(pos) * len(neg)),
             float(ordered_vs_comp) / (len(pos) * len(comp)),
             ndcg_at_k(relevance, 10),
             ndcg_at_k(relevance, 50)))
        pbar.update(1)

    pbar.close()
    # list() keeps the columns indexable when zip returns an iterator.
    queries = list(zip(*queries))
    return (
        np.median(queries[0]),  # MedR
        np.average(queries[1]),  # AUC
        np.average(queries[2]),  # cAUC
        np.average(queries[3]),  # NDCG@10
        np.average(queries[4])  # NDCG@50
    )
 def train(self, model):
     """Train ``model`` on (question, good answer, bad answer) triples.

     Questions are split at random into a validation part (fraction
     ``validation_split`` of the question ids) and a training part. For
     every training question each (good user, bad user) pair becomes one
     triple, followed by ``bad_answer_sample`` extra triples whose bad user
     is drawn at random from all users. After each epoch the validation rows
     are scored and the mean of ``0.5 * NDCG@5 + 0.5 * NDCG@10`` over
     validation questions is printed.

     Parameters read from ``self.params``: ``save_every``, ``batch_size``,
     ``nb_epoch``, ``validation_split``, ``bad_answer_sample``. ``sample``
     is read from ``self.conf``; a positive value subsamples that many
     training triples.

     Args:
         model: object exposing ``fit`` and ``prediction_model.predict``.

     Returns:
         Dict ``{'ndcg': best_mean_ndcg, 'epoch': epoch_of_best}``.
     """
     # Imported once up front instead of once per validation group.
     from ndcg import ndcg_at_k

     save_every = self.params.get('save_every', None)
     batch_size = self.params.get('batch_size', 128)
     nb_epoch = self.params.get('nb_epoch', 10)
     split = self.params.get('validation_split', 0)
     bad_answer_sample = self.params.get('bad_answer_sample', 0)

     training_set = self.load('invited_info_train.pkl')
     question_info = self.load('question_info.pkl')
     user_info = self.load('user_info.pkl')

     train_group = training_set.groupby('question_id')
     all_users = list(user_info.index)

     question_ids = []
     good_answer_ids = []
     bad_answer_ids = []

     questions = [qid for qid, _ in train_group]
     len_split = int(len(questions) * split)
     # A set gives constant-time membership tests in the loop below.
     valid_questions = set(random.sample(questions, len_split))

     valid_qid = []
     valid_uid = []
     valid_answer = []

     for question_id, answer_info in train_group:
         if question_id in valid_questions:
             # presumably columns are (question_id, user_id, answer_flag)
             # in this order; verify against the pickled frame
             for info in answer_info.values:
                 valid_qid.append(info[0])
                 valid_uid.append(info[1])
                 valid_answer.append(info[2])
             continue

         users = answer_info['user_id']
         good_users = users[answer_info.answer_flag == 1]
         bad_users = users[answer_info.answer_flag == 0]
         for good in good_users:
             for bad in bad_users:
                 question_ids.append(question_id)
                 good_answer_ids.append(good)
                 bad_answer_ids.append(bad)
                 # Extra negatives drawn from the full user list.
                 for sampled in random.sample(all_users, bad_answer_sample):
                     question_ids.append(question_id)
                     good_answer_ids.append(good)
                     bad_answer_ids.append(sampled)

     sample = self.conf.get('sample')
     # A missing 'sample' key means no subsampling rather than a TypeError.
     if sample is not None and sample > 0:
         print('Selected sample, num is %d' % sample)
         picked = random.sample(range(len(question_ids)), sample)
         question_ids = [question_ids[s] for s in picked]
         good_answer_ids = [good_answer_ids[s] for s in picked]
         bad_answer_ids = [bad_answer_ids[s] for s in picked]

     question_words_seq = [
         list(question_info['words_seq'][x])
         for x in question_ids]

     answers_good_words_seq = [
         list(user_info['user_desc_words_sec'][x])
         for x in good_answer_ids]

     answers_bad_words_seq = [
         list(user_info['user_desc_words_sec'][x])
         for x in bad_answer_ids]

     question_words_seq = self.padq(question_words_seq)
     answers_good_words_seq = self.pada(answers_good_words_seq)
     answers_bad_words_seq = self.pada(answers_bad_words_seq)

     # valid set
     valid_question_words_seq = [
         list(question_info['words_seq'][x])
         for x in valid_qid]

     valid_answers_words_seq = [
         list(user_info['user_desc_words_sec'][x])
         for x in valid_uid]

     valid_question_words_seq = self.padq(valid_question_words_seq)
     # Answers are padded with pada, matching the training answers above.
     valid_answers_words_seq = self.pada(valid_answers_words_seq)
     valid_data = {'qid': valid_qid, 'uid': valid_uid, 'answer_flag':
         valid_answer}
     valid_set = pd.DataFrame(data=valid_data)

     val_ndcg = {'ndcg': 0, 'epoch': 0}

     self.save_conf()

     # NOTE(review): range(1, nb_epoch) runs nb_epoch - 1 epochs - confirm
     # whether nb_epoch epochs were intended.
     for i in range(1, nb_epoch):
         print('Epoch %d :: ' % i, end='')
         self.print_time()
         model.fit([question_words_seq, answers_good_words_seq,
                    answers_bad_words_seq],
                   nb_epoch=1,
                   batch_size=batch_size,
                   )

         predictions = model.prediction_model.predict(
                 [valid_question_words_seq, valid_answers_words_seq],
                 batch_size=batch_size, verbose=1)

         # Each prediction entry is indexed as [0][0] to extract its score.
         valid_set['predict'] = [row[0][0] for row in predictions]

         scores = []
         for _, rows in valid_set.groupby('qid'):
             ranked = rows.sort_values('predict', ascending=False)
             # Relevance handed to NDCG is predicted score times
             # answer_flag, in descending predicted-score order.
             gains = [
                 ranked['predict'][idx] * ranked['answer_flag'][idx]
                 for idx in ranked.index]
             scores.append(
                     ndcg_at_k(gains, 5) * 0.5 + ndcg_at_k(gains, 10) * 0.5)

         valid_ndcg = np.mean(scores)
         print('ndcg mean is %lf' % valid_ndcg)
         if valid_ndcg > val_ndcg['ndcg']:
             val_ndcg = {'ndcg': valid_ndcg, 'epoch': i}

         print('Best: Ndcg = {}, Epoch = {}'.format(val_ndcg['ndcg'],
                                                    val_ndcg['epoch']))

         if save_every is not None and i % save_every == 0:
             self.save_epoch(model, i)

     return val_ndcg