def computeNDCG_GR(base_path, result_path):
    """Average NDCG of sessions re-ranked by the global-reward method.

    Aligns each session yielded by ``session_GR(result_path)`` with the
    baseline entry at the same position in ``parse_baseline(base_path)``;
    stops with a message on the first session-id mismatch.

    Returns the mean NDCG over the matched sessions.
    """
    total = 0
    num = 0
    # Sessions where the re-ranked list scores below the baseline
    # (kept for diagnostics; not part of the return value).
    low = 0
    base_list = parse_baseline(base_path)
    session_list = session_GR(result_path)
    for idx, session in enumerate(session_list):
        baseline = base_list[idx]
        if baseline['session_id'] != session[0]:
            print('not find corresponding session')
            break
        # Map url id -> baseline relevance score for O(1) lookups.
        rel_by_url = {int(url[1]): url[0] for url in baseline['url_list']}
        lists = [(rel_by_url[int(item[1])], item[1]) for item in session[1]]
        # Compute each NDCG once instead of re-evaluating it per comparison.
        new_ndcg = NDCG.computeNDCG(lists)
        if NDCG.computeNDCG(baseline['url_list']) > new_ndcg:
            low += 1
        total += new_ndcg
        num += 1
    print('Global Reward:', total / num)
    return total / num
# Exemplo n.º 2 (0)
 def scoring(self):
     """Fit the grid-searched best model and report its test-set NDCG.

     Returns the fitted estimator obtained from ``super().grid_search()``.
     """
     # Compute NDCG when using the best model (translated from Japanese)
     mod = super().grid_search()
     pred = mod.predict(self.X_test)
     # NDCG.ndcg2 presumably scores predictions against the true action
     # types at cutoff k=20 -- confirm against the NDCG module.
     print('NDCG:', NDCG.ndcg2(self.y_test.action_type.values, pred, k=20))
     print('Learning Done')
     return mod
def compute_total_reward(test_path, result_path, base_path, parameter):
    """Average NDCG after blending global- and local-reward scores.

    For each baseline session (one JSON object per line in ``base_path``)
    the global reward (weight ``parameter['bs']``) is combined with the
    local reward, the urls are re-ranked by the blended score, and NDCG is
    accumulated over the list recovered by ``recover_rel``.

    Returns the mean NDCG over all baseline lines.
    """
    gr = session_GR(result_path, parameter)
    lr = session_LR(test_path, base_path, parameter)
    ndcg = 0
    num = 0
    with open(base_path, 'r', newline='') as base:
        for line in base:
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            global_reward = next(gr)
            local_reward = next(lr)
            new_url_list = []
            # Only blend when both generators are on the same session id.
            if int(global_reward[0]) == int(local_reward[0]):
                gr_list = global_reward[1]
                lr_list = local_reward[1]
                # global_reward[2] interpolates between the relevant and
                # irrelevant scale factors; it is loop-invariant, so hoist it.
                lr_scale = (global_reward[2] * parameter['s_rel']
                            + (1 - global_reward[2]) * parameter['s_irel'])
                for gr_item, lr_item in zip(gr_list, lr_list):
                    if int(gr_item[1]) == int(lr_item[1]):
                        score = (gr_item[0] * parameter['bs']
                                 + lr_item[0] * (1 - parameter['bs']) * lr_scale)
                        new_url_list.append([round(score, 6), int(lr_item[1])])
                new_url_list.sort(reverse=True)
            final_url_list = recover_rel(base_url_list, new_url_list)
            num += 1
            ndcg += NDCG.computeNDCG(final_url_list)
    print('Total Reward:', ndcg / num)
    return (ndcg / num)
def sample(test_path, base_path):
    """Compare re-ranked NDCG against baseline NDCG, session by session.

    Counts sessions where the positional-decay re-ranking improves NDCG
    (``a``) versus degrades it (``b``), printing the degraded sessions.
    """
    s = parse_test_session(test_path)
    a, b = 0, 0
    # Renamed from `base`: the original rebound the open-file name to a
    # float inside the loop, which was confusing (though harmless).
    with open(base_path, 'r', newline='') as base_file:
        for line in base_file:
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            base_serp = baseline['serp']
            base_dict = make_dict(base_url_list)
            session = next(s)
            if session['session_id'] != baseline['session_id']:
                print('error,not found corresponding base or session')
                break
            # Best positional-decay score observed for each url across the
            # session's other SERPs (defaultdict avoids shadowing `dict`).
            score_by_url = collections.defaultdict(int)
            for query in session['query_list']:
                q_url_list = query['url_list']
                serp_now = query['serp']
                if serp_now == base_serp:
                    continue
                for i, item in enumerate(q_url_list):
                    if item[0] > 1:
                        score = (item[0] - 1) / math.log(i + 2, 2)
                        score = (score * math.pow(0.8, (base_serp - serp_now))) / 2
                        if score > score_by_url[item[1]]:
                            score_by_url[item[1]] = score
            temp_url_list = []
            for idx, base_item in enumerate(base_url_list):
                # Positional prior (DCG-style discount) plus session reward.
                position_score = 1 / math.log(idx + 2, 2)
                temp_url_list.append(
                    [round(position_score + score_by_url[base_item[1]], 2),
                     base_item[1]])
            temp_url_list.sort(reverse=True)
            # Swap scores back to baseline relevance so NDCG is computed
            # over the new ordering of the original labels.
            for url in temp_url_list:
                url[0] = base_dict[url[1]]
            new_ndcg = NDCG.computeNDCG(temp_url_list)
            old_ndcg = NDCG.computeNDCG(base_url_list)
            if new_ndcg > old_ndcg:
                a += 1
            elif new_ndcg < old_ndcg:
                print(session)
                print(temp_url_list)
                b += 1
    print('a', a, 'b', b)
# Exemplo n.º 5 (0)
def RANK(True_rank,UIP,UIT):
    """Rank each user's items by predicted score and collect NDCG values.

    Skips users with fewer than 5 predictions or constant true scores.
    Prints the mean NDCG and returns the per-user NDCG list.
    (Python 2 code: note the `print` statement.)
    """
    Pred_rank = {}
    T_NDCG = []
    for i in True_rank.keys():
        if len(UIP[i])<5:
            continue
        if len(np.unique(UIT[i]))==1:
            continue
        # argsort ascending, then reversed -> descending predicted score.
        Pred_rank[i] = True_rank[i][np.argsort(UIP[i])][::-1]
        T_NDCG.append( NDCG(Pred_rank[i]) )
        #T_NDCG[i] =  NDCG(Pred_rank[i])
    print np.mean(T_NDCG)
    return T_NDCG
def test_local_method_proportion(test_path, base_path):
    """Count baseline urls whose in-session evidence disagrees or agrees.

    ``a``: urls judged irrelevant in the baseline (rel == 1) but observed
    with rel > 1 elsewhere in the session; ``b``: urls relevant in both.
    Prints both counts with the average baseline NDCG of their sessions.
    """
    s = parse_test_session(test_path)
    a, b = 0, 0
    b_ndcg = 0
    a_ndcg = 0
    total = 0
    with open(base_path, 'r', newline='') as base:
        for line in base:
            total += 1
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            base_serp = baseline['serp']
            session = next(s)
            if session['session_id'] != baseline['session_id']:
                print('error,not found corresponding base or session')
                break
            # Highest rel (> 1) seen for each url on the session's other SERPs.
            test_dict = collections.defaultdict(int)
            for query in session['query_list']:
                q_url_list = query['url_list']
                serp_now = query['serp']
                if serp_now == base_serp:
                    continue
                for item in q_url_list:
                    if item[0] > 1:
                        test_dict[item[1]] = item[0]
            for url in base_url_list:
                if test_dict[url[1]] > 1 and url[0] == 1:
                    a += 1
                    a_ndcg += NDCG.computeNDCG(base_url_list)
                    print(session)
                elif url[0] > 1 and test_dict[url[1]] > 1:
                    b += 1
                    b_ndcg += NDCG.computeNDCG(base_url_list)
    # Guard the averages: a or b may legitimately be zero, which previously
    # raised ZeroDivisionError on this final print.
    print('a', a, a_ndcg / a if a else 0,
          'b', b, b_ndcg / b if b else 0, 'total', total)
def ndcg_distribute(path):
    """Plot a histogram of per-session NDCG in 10 buckets of width 0.1.

    Each line of ``path`` is a JSON object with a ``url_list``; bucket
    upper bounds are 0.1 .. 1.0 (strict upper bound per bucket).
    """
    ndcg_list = [[i / 10, 0] for i in range(1, 11)]
    with open(path, 'r') as baseline:
        for line in baseline:
            url_list = json.loads(line)['url_list']
            ndcg = NDCG.computeNDCG(url_list)
            for item in ndcg_list:
                if ndcg < item[0]:
                    item[1] += 1
                    break
            else:
                # Fix: ndcg == 1.0 falls past every strict upper bound and
                # was previously dropped; count it in the last bucket.
                ndcg_list[-1][1] += 1
    print(ndcg_list)
    x = [i[0] for i in ndcg_list]
    y = [i[1] for i in ndcg_list]
    plt.bar(x, y, -0.1, color='y', edgecolor='g', linewidth=1, align='edge')
    plt.show()
# Exemplo n.º 8 (0)
def RANK(True_rank,UIP,UIT,N):
    """Return {user: NDCG of that user's top-N predicted ranking}.

    A user is skipped when they have at most N predictions, when their
    true scores are constant, or when the resulting top-N ranking is
    constant.  ``USER`` counts the users actually scored.
    """
    Pred_rank = {}
    T_NDCG = {}
    USER = 0
    for user in UIP:
        preds = UIP[user]
        if len(preds) <= N:
            continue
        if len(np.unique(UIT[user])) == 1:
            continue
        # Indices of the N highest predicted scores, best first.
        top_idx = np.argsort(preds)[::-1][:N]
        ranking = True_rank[user][top_idx]
        if len(np.unique(ranking)) == 1:
            continue
        Pred_rank[user] = ranking
        T_NDCG[user] = NDCG(ranking)
        USER += 1
    return T_NDCG
    # NOTE(review): fragment of a larger (Python 2) function -- its `def`
    # header and the names below (prob, rf_book, rf_click, X_test, ...) are
    # defined outside this chunk.
    a=sort_rows_by_icol_descendent(prob,4)    #first let's rank by click_predict
    a=sort_rows_by_icol_descendent(a,5)       #now let's rank by book_predict
    a=sort_rows_by_icol_increment(a,0)

    matrix=np.array(a[:,0:4])

#print NDCG.NDCG(matrix)

#for the test data


    book_predict_test=rf_book.predict_proba(X_test)
    click_predict_test=rf_click.predict_proba(X_test)
#print len(book_predict_test)
#print len(click_predict_test)

    # Columns: presumably srch_id, hotel_id, true click, true book,
    # P(click), P(book) -- confirm against the training-side code.
    prob_test=np.array([x_srch_id_test,x_hotel_id_test,y_click_test,y_book_test,click_predict_test[:,1],book_predict_test[:,1]])
    prob_test=np.array(map(list, zip(*prob_test)))

    b=sort_rows_by_icol_descendent(prob_test,4)    #first let's rank by click_predict
    b=sort_rows_by_icol_descendent(b,5)       #now let's rank by book_predict
    b=sort_rows_by_icol_increment(b,0)

    matrix_test=np.array(b[:,0:4])

#print matrix_test.shape
    print NDCG.NDCG(matrix)
    print NDCG.NDCG(matrix_test)

# Exemplo n.º 10 (0)
#X_train_click, X_test_click, y_train_click, y_test_click = cross_validation.train_test_split(X, y_click, test_size=0.4, random_state=0)

from sklearn.ensemble import RandomForestClassifier
# Python 2 script: trains two random forests -- one predicting bookings,
# one predicting clicks -- then ranks rows by the predicted probabilities
# and prints the resulting NDCG.
rf_book = RandomForestClassifier(n_estimators=20,
                                 max_features=51,
                                 max_depth=10)
rf_book.fit(X, y_book)
book_predict = rf_book.predict_proba(X)
#prob1=np.array([x_srch_id,x_hotel_id,book_predict[:,1]])
#prob1=np.array(map(list, zip(*prob1)))

#print a.shape
rf_click = RandomForestClassifier(n_estimators=20,
                                  max_features=51,
                                  max_depth=10)
rf_click.fit(X, y_click)
click_predict = rf_click.predict_proba(X)
# Columns: srch_id, hotel_id, true click, true book, P(click), P(book).
prob = np.array([
    x_srch_id, x_hotel_id, y_click, y_book, click_predict[:, 1],
    book_predict[:, 1]
])
# Transpose to one row per sample (py2: map returns a list).
prob = np.array(map(list, zip(*prob)))

a = sort_rows_by_icol_descendent(prob, 4)  #first let's rank by click_predict
a = sort_rows_by_icol_descendent(a, 5)  #now let's rank by book_predict
a = sort_rows_by_icol_increment(a, 0)

matrix = np.array(a[:, 0:4])

print NDCG.NDCG(matrix)
# Exemplo n.º 11 (0)
# Free memory from earlier feature engineering (the deleted names are
# defined outside this chunk).
del df_sessions
del device_freq
del action_freq

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)

# Report a 5-fold cross-validated NDCG for the configured classifier.
print('scores:', NDCG.cross_validation_score(X, labels,xgb,5))
'''
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)
# Exemplo n.º 12 (0)
# Python 2 script: trains an XGBoost classifier on the Airbnb users data
# and prints a cross-validated NDCG score (note the `print` statement).
import pandas as pd
from xgboost.sklearn import XGBClassifier
import NDCG

df_train = pd.read_csv('./input/train_users.csv')
# data output
truth = df_train['country_destination'].values
# format the data: drop id/date/age columns not used as features
df_all = df_train.drop(['id', 'date_first_booking','date_account_created','timestamp_first_active','age','country_destination'], axis=1)
# one-hot-encode each categorical feature, replacing the original column
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)
# data input
preds = df_all.values
# model
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
# call validation (3-fold)
print NDCG.cross_validation_score(preds,truth,xgb,3)
# Exemplo n.º 13 (0)
def computeNDCG_LR(test_path, base_path):
    """Average NDCG of sessions re-ranked by the local-reward method.

    Blends a positional prior with url/domain rewards harvested from the
    session's other SERPs, re-ranks the baseline urls by the blended score,
    and returns the mean NDCG over all sessions.
    """
    s = parse_test_session(test_path)
    ndcg = 0
    num = 0
    # Renamed from `base`: the original rebound the open-file name to a
    # float inside the loop, which was confusing (though harmless).
    with open(base_path, 'r', newline='') as base_file:
        for line in base_file:
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            base_serp = baseline['serp']
            base_dict = make_dict(base_url_list)
            session = next(s)
            if session['session_id'] != baseline['session_id']:
                print('error,not found corresponding base or session')
                break
            url_dict = collections.defaultdict(int)
            domain_dict = collections.defaultdict(int)
            for query in session['query_list']:
                q_url_list = query['url_list']
                serp_now = query['serp']
                if serp_now == base_serp:
                    continue
                for i, item in enumerate(q_url_list):
                    rel, url, domain = item[0], item[1], item[2]
                    if rel > 1:
                        # url and domain shared the exact same reward call;
                        # compute it once instead of twice.
                        reward_score = compute_reward(rel, i, 0.6,
                                                      base_serp - serp_now)
                        if reward_score > url_dict[url]:
                            url_dict[url] = reward_score
                        if reward_score > domain_dict[domain]:
                            domain_dict[domain] = reward_score
            temp_url_list = []
            for idx, base_item in enumerate(base_url_list):
                # Positional prior (DCG-style discount), further damped
                # beyond rank 5.
                position_score = 1 / math.log(idx + 2, 2)
                if idx >= 5:
                    position_score = position_score / (idx)
                d_reward = domain_dict[base_item[2]]
                u_reward = url_dict[base_item[1]]
                if d_reward > 0 and u_reward == 0:
                    reward = d_reward * 0.6
                elif u_reward > 0:
                    reward = u_reward * 0.1
                else:
                    reward = 0
                temp_url_list.append([position_score + reward / 2,
                                      base_item[1]])
            temp_url_list.sort(reverse=True)
            # Swap scores back to baseline relevance so NDCG is computed
            # over the new ordering of the original labels.
            for url in temp_url_list:
                url[0] = base_dict[url[1]]
            ndcg += NDCG.computeNDCG(temp_url_list)
            num += 1
    print('Local Reward:', ndcg / num)
    return ndcg / num



# computeNDCG_LR(gl.test_sample_url_domain,gl.baseline_sample_url_domain)
# test_local_method_proportion(gl.test_clean_sample,gl.baseline_clean_sample)
# Exemplo n.º 14 (0)
        # NOTE(review): fragment -- the enclosing function and loop header,
        # plus `dataReadIn1` and `t2`, are defined outside this chunk.
        id_lat_lng_table.append([])

    for cnt1 in range(len(id_lat_lng_table)):
        for cnt2 in range(3):
            id_lat_lng_table[cnt1].append(dataReadIn1[cnt1][cnt2])
        # Extra slot initialised to 0 (presumably a per-venue counter --
        # confirm against the rest of the script).
        id_lat_lng_table[cnt1].append(0)

    # Latitude/longitude are read in as strings; convert them to floats.
    for cnt in range(len(id_lat_lng_table)):
        id_lat_lng_table[cnt][1] = float(id_lat_lng_table[cnt][1])
        id_lat_lng_table[cnt][2] = float(id_lat_lng_table[cnt][2])
    t3 = time.time()
    print("Time for constructing table : " + str(t3 - t2) + "sec")

    # Collect the second field of every check-in record across the four
    # dataset parts, clearing each part after use to bound memory.
    Checkin_id = []

    Checkins_data_1 = NDCG.readfile("dataset_TIST2015_Checkins_part1.txt")
    for cnt in range(len(Checkins_data_1)):
        Checkin_id.append(Checkins_data_1[cnt][1])
    Checkins_data_1.clear()

    Checkins_data_2 = NDCG.readfile("dataset_TIST2015_Checkins_part2.txt")
    for cnt in range(len(Checkins_data_2)):
        Checkin_id.append(Checkins_data_2[cnt][1])
    Checkins_data_2.clear()

    Checkins_data_3 = NDCG.readfile("dataset_TIST2015_Checkins_part3.txt")
    for cnt in range(len(Checkins_data_3)):
        Checkin_id.append(Checkins_data_3[cnt][1])
    Checkins_data_3.clear()

    Checkins_data_4 = NDCG.readfile("dataset_TIST2015_Checkins_part4.txt")
# Exemplo n.º 15 (0)
def eval():
    """Evaluate a battery of IR models on the CISI collection.

    Builds five vectorial models, Jelinek-Mercer and Okapi-BM25 (plus a
    pagerank variant of each), scores them with several metrics at k=9,
    and displays the metric/model matrix as an image.

    Raises FileNotFoundError if the corpus cannot be located.
    """
    parsedQuery = None
    parsedText = None
    # Renamed from `file` to avoid shadowing the builtin.
    corpus = "data/cisi/cisi"
    # Look for the corpus relative to the current dir or its parent.
    for prefix in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(prefix + corpus)
            parsedText = myParser.buildDocCollectionSimple(prefix + corpus + ".txt", ".W")
            break
        except FileNotFoundError:
            pass
    if parsedText is None:
        # Fail with a clear error instead of an AttributeError below.
        raise FileNotFoundError("could not locate corpus: " + corpus)

    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    weighters = [clas(indexer) for clas in
                 [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in weighters]
    models.append(jelinekMercer.JelinekMercer(indexer))
    models.append(okapiBM25.OkapiBM25(indexer))

    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]

    print("fit")

    # Wrap each base model in a pagerank variant; range(len(models)) is
    # evaluated once, so the appended wrappers are not re-wrapped.
    for i in range(len(models)):
        models.append(pagerank.PagerankMarcheAlea(indexer, models[i]))

    k = 9
    metrics = [
        averagePrecision.AveragePrecision(),
        precisionAtK.PrecisionAtK(k),
        fMesureK.FMesureK(1, k),
        rappelAtK.RappelAtK(k),
        NDCG.NDCG(k),
        reciprocalRank.ReciprocalRank()]

    perf = []
    print(models)
    print(metrics)
    for i, model in enumerate(models):
        print(i, "/", len(models))
        perf.append([])
        # Renamed the comprehension index so it no longer shadows k=9.
        pred = [model.getRanking(query) for query in data_fit]

        for metric in metrics:
            score, std = metric.eval_list_query(pred, labels)
            perf[-1].append(score)
        print([round(x, 4) for x in perf[-1]])

    import matplotlib.pyplot as plt
    plt.imshow(perf)
    plt.colorbar()
    plt.xlabel("Metrique")
    plt.ylabel("Modèle")
    plt.figtext(0, 0, "Metriques : 0 : averagePrecision,1 : precisionAtK,2 : fMesureK,3 : rappelAtK,4 : NDCG,5 : reciprocalRank;Modèles : 0-4: Vectoriel, 5 : jelinekMercer,6 : okapiBM25, 7-12 : avec pagerank")
    plt.show()
# Exemplo n.º 16 (0)
def main():
    """Interactive search-engine REPL.

    Repeatedly prompts for a query, retrieves matching documents from the
    DB, shows Google results alongside local results with snippets, then
    asks the user for relevance judgements and prints NDCG with and
    without ground truth.  Uses `raw_input`, so this targets Python 2.
    """
    try:
        db = DBRepository()
        flag = True
        while flag:
            # Prompt until the user enters a non-empty query.
            while True:
                clear_screen()
                puts(
                    colored.green('SEARCHING ENGINE') +
                    colored.clean('\n -by Three Legged Spider'))
                space(1)
                raw_user_query = raw_input(
                    colored.yellow("Enter your query: "))
                if len(raw_user_query) >= 1:
                    break
                else:
                    print('Enter valid query')
                    time.sleep(0.5)
                    continue

            user_query = query_preprocessor(raw_user_query)
            # print(user_query)
            common_docs, unique_docs = db.search_double_query(user_query)

            # print(common_docs)
            # space(3)
            # print(unique_docs)
            if common_docs == 0:
                space(1)
                print('No results found')
                space(2)
                exit_choice(flag)
            elif common_docs != 0:
                # print(common_docs)
                proximity_dict = {}
                # Multi-word queries get a proximity-based re-scoring.
                if len(user_query) > 1:
                    proximity_dict = db.get_proximity(list(common_docs),
                                                      user_query)
                    # print(docs_dict)

                    proximity_dict = proximity(list(common_docs),
                                               proximity_dict)
                    # NOTE(review): this sorted() result is discarded --
                    # it does not reorder proximity_dict; likely a bug,
                    # confirm the intended ranking behaviour.
                    sorted(proximity_dict,
                           key=lambda x: (proximity_dict[x]['final_score']),
                           reverse=True)

                    common_docs = set(proximity_dict.keys())

                final_document_list, final_url_list = final_list_creation(
                    common_docs, unique_docs, 5)

                # Word positions are fetched but only used by the disabled
                # snippet logic in the commented block below.
                if len(user_query) == 1:
                    position_dict = db.position_for_one_word(
                        list(final_document_list), user_query[0])
                else:
                    unique_doc_positions1 = db.position_for_one_word(
                        list(final_document_list), user_query[0])
                    unique_doc_positions2 = db.position_for_one_word(
                        list(final_document_list), user_query[1])

                text_snippet = []

                for doc in list(final_document_list):
                    text_snippet.append(get_snippet(doc, user_query))

                # if len(user_query)==1:
                #     for doc in list(final_document_list):
                #         text_snippet.append(get_snippet(doc, position_dict[doc]['pos_w1']))
                # else:
                #     for doc in list(final_document_list):
                #         # space()
                #         # print(doc)
                #         if doc in common_docs:
                #             # print('in common')
                #             print(min(proximity_dict[doc]['pos_w1']), min(proximity_dict[doc]['pos_w2']))
                #             text_snippet.append(get_snippet(doc, min(proximity_dict[doc]['pos_w1']), \
                #                 min(proximity_dict[doc]['pos_w2'])))
                #         elif doc in unique_docs:
                #             # print('in unique')

                #             if doc in unique_doc_positions1.keys():
                #                 print(unique_doc_positions1[doc]['pos_w1'])
                #                 a = get_snippet(doc, unique_doc_positions1[doc]['pos_w1'])
                #                 text_snippet.append(a)
                #             elif doc in unique_doc_positions2.keys():
                #                 print(unique_doc_positions2[doc]['pos_w1'])
                #                 a = get_snippet(doc, unique_doc_positions2[doc]['pos_w1'])
                #                 text_snippet.append(a)
                #             else:
                #                 print('Something is wrong')

                print(colored.cyan('Google Results: '))
                output_string(google_search(user_query))
                space(1)
                print(colored.cyan("Search Results:"))
                output_string(final_url_list, text_snippet)
                # output_string(final_url_list)
                space(1)

                # weight_list = db.weigths_of_the_output(final_document_list, user_query)
                # print(weight_list)

                # Collect user-provided relevance judgements for both
                # result lists, space-separated integers.
                ground_truth = [
                    int(x) for x in raw_input(
                        'List of relevance score for each result by Google seperated by space: '
                    ).split()
                ]
                relevance_score = [
                    int(x) for x in raw_input(
                        'List of relevance score for each results seperated by space: '
                    ).split()
                ]

                N_list = NDCG.ndcg_at_k(relevance_score, 5)
                N_list_with_ground_truth = NDCG.ndcg_score(ground_truth,
                                                           relevance_score,
                                                           k=5)

                space(1)
                print('NDCG without Ground Truth: ' + str(N_list))
                print('NDCG with Ground Truth: ' +
                      str(N_list_with_ground_truth))

                space(1)
                exit_choice(flag)

    except (KeyboardInterrupt, SystemExit):
        print('Exit by User')