def computeNDCG_GR(base_path, result_path):
    """Compute the mean NDCG when ranking by the global reward only."""
    total = 0
    num = 0
    idx = 0
    low = 0  # sessions whose re-ranked NDCG falls below the baseline
    base_list = parse_baseline(base_path)
    session_list = session_GR(result_path)
    for session in session_list:
        if base_list[idx]['session_id'] == session[0]:
            score_dict = {}
            lists = []
            for url in base_list[idx]['url_list']:
                score_dict[int(url[1])] = url[0]
            for k in range(len(session[1])):
                score = score_dict[int(session[1][k][1])]
                lists.append((score, session[1][k][1]))
            if NDCG.computeNDCG(base_list[idx]['url_list']) > NDCG.computeNDCG(lists):
                low += 1
            total += NDCG.computeNDCG(lists)
            num += 1
        else:
            print('corresponding session not found')
            break
        idx += 1
    print('Global Reward:', total / num)
    return total / num
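# NDCG.computeNDCG itself is not shown in these snippets. Judging from the call
# sites, it takes a list of (relevance, url) pairs already in ranked order. A
# minimal sketch of such a helper, assuming linear gain and a log2 discount
# (the gain function and the handling of an all-zero list are assumptions):
import math

def computeNDCG(url_list):
    # url_list: (relevance, url) pairs in ranked order
    dcg = sum(rel / math.log(i + 2, 2) for i, (rel, _) in enumerate(url_list))
    ideal = sorted((rel for rel, _ in url_list), reverse=True)
    idcg = sum(rel / math.log(i + 2, 2) for i, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0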
def scoring(self):
    # Compute the NDCG obtained with the best model from the grid search
    mod = super().grid_search()
    pred = mod.predict(self.X_test)
    print('NDCG:', NDCG.ndcg2(self.y_test.action_type.values, pred, k=20))
    print('Learning Done')
    return mod
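# NDCG.ndcg2 is also not shown. A plausible sketch, assuming it orders the true
# action_type labels by descending predicted score and computes NDCG@k against
# the ideal ordering (the signature, the gain function, and treating pred as a
# sortable score are all assumptions):
import numpy as np

def ndcg2(y_true, y_score, k=20):
    order = np.argsort(y_score)[::-1]        # best predictions first
    gains = np.asarray(y_true)[order][:k]
    discounts = np.log2(np.arange(2, gains.size + 2))
    dcg = np.sum(gains / discounts)
    ideal = np.sort(np.asarray(y_true))[::-1][:gains.size]
    idcg = np.sum(ideal / discounts)
    return dcg / idcg if idcg > 0 else 0.0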
def compute_total_reward(test_path, result_path, base_path, parameter):
    """Blend the global and local rewards and report the mean NDCG."""
    gr = session_GR(result_path, parameter)
    lr = session_LR(test_path, base_path, parameter)
    ndcg = 0
    num = 0
    with open(base_path, 'r', newline='') as base:
        for line in base:
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            global_reward = next(gr)
            local_reward = next(lr)
            new_url_list = []
            if int(global_reward[0]) == int(local_reward[0]):
                gr_list = global_reward[1]
                lr_list = local_reward[1]
                for i in range(len(gr_list)):
                    if int(gr_list[i][1]) == int(lr_list[i][1]):
                        # weighted blend: 'bs' balances global vs. local reward,
                        # and the local part is scaled by the session's
                        # relevance/irrelevance weights
                        score = gr_list[i][0] * parameter['bs'] \
                            + lr_list[i][0] * (1 - parameter['bs']) \
                            * (global_reward[2] * parameter['s_rel']
                               + (1 - global_reward[2]) * parameter['s_irel'])
                        new_url_list.append([round(score, 6), int(lr_list[i][1])])
                new_url_list.sort(reverse=True)
                final_url_list = recover_rel(base_url_list, new_url_list)
                num += 1
                ndcg += NDCG.computeNDCG(final_url_list)
    print('Total Reward:', ndcg / num)
    return ndcg / num
def sample(test_path, base_path):
    s = parse_test_session(test_path)
    a, b = 0, 0  # a: sessions improved over the baseline, b: sessions degraded
    with open(base_path, 'r', newline='') as base:
        for line in base:
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            base_serp = baseline['serp']
            base_dict = make_dict(base_url_list)
            session = next(s)
            temp_url_list = []
            if session['session_id'] == baseline['session_id']:
                reward_dict = collections.defaultdict(int)
                for query in session['query_list']:
                    q_url_list = query['url_list']
                    serp_now = query['serp']
                    if serp_now != base_serp:
                        for i in range(len(q_url_list)):
                            if q_url_list[i][0] > 1:
                                # position-discounted gain, decayed by SERP distance
                                score = (q_url_list[i][0] - 1) / math.log(i + 2, 2)
                                score = score * math.pow(0.8, base_serp - serp_now) / 2
                                if score > reward_dict[q_url_list[i][1]]:
                                    reward_dict[q_url_list[i][1]] = score
                for idx in range(len(base_url_list)):
                    base_score = 1 / math.log(idx + 2, 2)
                    temp_url_list.append(
                        [round(base_score + reward_dict[base_url_list[idx][1]], 2),
                         base_url_list[idx][1]])
                temp_url_list.sort(reverse=True)
                for url in temp_url_list:
                    url[0] = base_dict[url[1]]
                new_ndcg = NDCG.computeNDCG(temp_url_list)
                old_ndcg = NDCG.computeNDCG(base_url_list)
                if new_ndcg > old_ndcg:
                    a += 1
                elif new_ndcg < old_ndcg:
                    print(session)
                    print(temp_url_list)
                    b += 1
            else:
                print('error: corresponding base or session not found')
                break
    print('a', a, 'b', b)
def RANK(True_rank, UIP, UIT):
    Pred_rank = {}
    T_NDCG = []
    for i in True_rank.keys():
        if len(UIP[i]) < 5:
            continue
        if len(np.unique(UIT[i])) == 1:
            continue
        # true relevances ordered by descending predicted score
        Pred_rank[i] = True_rank[i][np.argsort(UIP[i])][::-1]
        T_NDCG.append(NDCG(Pred_rank[i]))
    print(np.mean(T_NDCG))
    return T_NDCG
def test_local_method_proportion(test_path, base_path):
    s = parse_test_session(test_path)
    a, b = 0, 0
    a_ndcg = 0
    b_ndcg = 0
    total = 0
    with open(base_path, 'r', newline='') as base:
        for line in base:
            total += 1
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            base_serp = baseline['serp']
            session = next(s)
            test_dict = collections.defaultdict(int)
            if session['session_id'] == baseline['session_id']:
                for query in session['query_list']:
                    q_url_list = query['url_list']
                    serp_now = query['serp']
                    if serp_now != base_serp:
                        for i in range(len(q_url_list)):
                            if q_url_list[i][0] > 1:
                                test_dict[q_url_list[i][1]] = q_url_list[i][0]
                for url in base_url_list:
                    if test_dict[url[1]] > 1 and url[0] == 1:
                        # relevant in another query but not in the baseline
                        a += 1
                        a_ndcg += NDCG.computeNDCG(base_url_list)
                        print(session)
                    elif url[0] > 1 and test_dict[url[1]] > 1:
                        # relevant in both the baseline and another query
                        b += 1
                        b_ndcg += NDCG.computeNDCG(base_url_list)
            else:
                print('error: corresponding base or session not found')
                break
    print('a', a, a_ndcg / a, 'b', b, b_ndcg / b, 'total', total)
def ndcg_distribute(path):
    """Plot the distribution of baseline NDCG values in 0.1-wide buckets."""
    ndcg_list = [[i / 10, 0] for i in range(1, 11)]
    with open(path, 'r') as baseline:
        for line in baseline:
            url_list = json.loads(line)['url_list']
            ndcg = NDCG.computeNDCG(url_list)
            for item in ndcg_list:
                # use <= so a perfect NDCG of 1.0 lands in the last bucket
                if ndcg <= item[0]:
                    item[1] += 1
                    break
    print(ndcg_list)
    x = [i[0] for i in ndcg_list]
    y = [i[1] for i in ndcg_list]
    plt.bar(x, y, -0.1, color='y', edgecolor='g', linewidth=1, align='edge')
    plt.show()
def RANK(True_rank, UIP, UIT, N):
    Pred_rank = {}
    T_NDCG = {}
    USER = 0  # number of users that passed all the filters
    for i in UIP.keys():
        if len(UIP[i]) <= N:
            continue
        if len(np.unique(UIT[i])) == 1:
            continue
        # top-N true relevances, ordered by descending predicted score
        Pred_rank[i] = True_rank[i][np.argsort(UIP[i])][::-1][:N]
        if len(np.unique(Pred_rank[i])) == 1:
            continue
        T_NDCG[i] = NDCG(Pred_rank[i])
        USER += 1
    return T_NDCG
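# Both RANK variants above assume an NDCG() that takes the true relevance
# labels already in predicted-rank order. A minimal sketch under that
# assumption, with linear gain (the actual gain used upstream is unknown):
import numpy as np

def NDCG(rels):
    rels = np.asarray(rels, dtype=float)
    discounts = np.log2(np.arange(2, rels.size + 2))
    dcg = np.sum(rels / discounts)
    idcg = np.sum(np.sort(rels)[::-1] / discounts)
    return dcg / idcg if idcg > 0 else 0.0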
# rank the training rows: by click probability, then booking probability,
# then restore the per-query grouping
a = sort_rows_by_icol_descendent(prob, 4)   # first rank by click_predict
a = sort_rows_by_icol_descendent(a, 5)      # now rank by book_predict
a = sort_rows_by_icol_increment(a, 0)       # group back by srch_id
matrix = np.array(a[:, 0:4])

# the same ranking on the test data
book_predict_test = rf_book.predict_proba(X_test)
click_predict_test = rf_click.predict_proba(X_test)
prob_test = np.array([x_srch_id_test, x_hotel_id_test, y_click_test, y_book_test,
                      click_predict_test[:, 1], book_predict_test[:, 1]])
prob_test = prob_test.T  # one row per (srch_id, hotel_id, click, book, p_click, p_book)
b = sort_rows_by_icol_descendent(prob_test, 4)
b = sort_rows_by_icol_descendent(b, 5)
b = sort_rows_by_icol_increment(b, 0)
matrix_test = np.array(b[:, 0:4])
print(NDCG.NDCG(matrix))
print(NDCG.NDCG(matrix_test))
# X_train_click, X_test_click, y_train_click, y_test_click = \
#     cross_validation.train_test_split(X, y_click, test_size=0.4, random_state=0)
from sklearn.ensemble import RandomForestClassifier

# one forest for booking probability, one for click probability
rf_book = RandomForestClassifier(n_estimators=20, max_features=51, max_depth=10)
rf_book.fit(X, y_book)
book_predict = rf_book.predict_proba(X)

rf_click = RandomForestClassifier(n_estimators=20, max_features=51, max_depth=10)
rf_click.fit(X, y_click)
click_predict = rf_click.predict_proba(X)

prob = np.array([x_srch_id, x_hotel_id, y_click, y_book,
                 click_predict[:, 1], book_predict[:, 1]])
prob = prob.T  # one row per (srch_id, hotel_id, click, book, p_click, p_book)

a = sort_rows_by_icol_descendent(prob, 4)  # first rank by click_predict
a = sort_rows_by_icol_descendent(a, 5)     # now rank by book_predict
a = sort_rows_by_icol_increment(a, 0)      # group back by srch_id
matrix = np.array(a[:, 0:4])
print(NDCG.NDCG(matrix))
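# sort_rows_by_icol_descendent / sort_rows_by_icol_increment are not defined in
# these snippets. The chained calls above only produce a lexicographic ordering
# if each sort is stable, so a sketch under that assumption (numeric matrix,
# stable sort) could look like:
import numpy as np

def sort_rows_by_icol_descendent(rows, icol):
    # stable sort of the rows by column icol, largest first
    rows = np.asarray(rows)
    return rows[np.argsort(-rows[:, icol], kind='stable')]

def sort_rows_by_icol_increment(rows, icol):
    # stable sort of the rows by column icol, smallest first
    rows = np.asarray(rows)
    return rows[np.argsort(rows[:, icol], kind='stable')]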
del df_sessions
del device_freq
del action_freq

# Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6,
                    colsample_bytree=0.6, seed=0)
print('scores:', NDCG.cross_validation_score(X, labels, xgb, 5))

# Disabled: full fit and submission generation (the original left this
# triple-quoted string unterminated)
'''
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# Taking the 5 classes with highest probabilities
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)
'''
import pandas as pd
from xgboost.sklearn import XGBClassifier
import NDCG

df_train = pd.read_csv('./input/train_users.csv')

# target labels
truth = df_train['country_destination'].values

# format the data: drop identifiers and one-hot encode the categoricals
df_all = df_train.drop(['id', 'date_first_booking', 'date_account_created',
                        'timestamp_first_active', 'age', 'country_destination'],
                       axis=1)
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language',
             'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
             'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

# feature matrix
preds = df_all.values

# model
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)

# run cross-validation
print(NDCG.cross_validation_score(preds, truth, xgb, 3))
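# NDCG.cross_validation_score is not part of these snippets. For this task
# (one true destination class per user, scored by NDCG@5 of the predicted
# class ranking), a plausible sketch is below; the fold handling, the k=5
# cutoff, and the per-fold score list are all assumptions:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

def cross_validation_score(X, labels, clf, n_folds, k=5):
    y = LabelEncoder().fit_transform(labels)
    scores = []
    for train_idx, test_idx in KFold(n_splits=n_folds, shuffle=True,
                                     random_state=0).split(X):
        clf.fit(X[train_idx], y[train_idx])
        proba = clf.predict_proba(X[test_idx])
        top_k = np.argsort(proba, axis=1)[:, ::-1][:, :k]  # best k classes per row
        hits = top_k == y[test_idx][:, None]               # 1 where the true class sits
        # with a single relevant item per row the ideal DCG is exactly 1
        dcg = (hits / np.log2(np.arange(2, k + 2))).sum(axis=1)
        scores.append(dcg.mean())
    return scores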
def computeNDCG_LR(test_path, base_path):
    """Compute the mean NDCG when re-ranking with the local reward only."""
    s = parse_test_session(test_path)
    ndcg = 0
    num = 0
    with open(base_path, 'r', newline='') as base:
        for line in base:
            baseline = json.loads(line)
            base_url_list = baseline['url_list']
            base_serp = baseline['serp']
            base_dict = make_dict(base_url_list)
            session = next(s)
            temp_url_list = []
            if session['session_id'] == baseline['session_id']:
                url_dict = collections.defaultdict(int)
                domain_dict = collections.defaultdict(int)
                for query in session['query_list']:
                    q_url_list = query['url_list']
                    serp_now = query['serp']
                    if serp_now != base_serp:
                        for i in range(len(q_url_list)):
                            rel, url, domain = q_url_list[i][0], q_url_list[i][1], q_url_list[i][2]
                            if rel > 1:
                                # keep the best reward seen for each URL and domain
                                url_score = compute_reward(rel, i, 0.6, base_serp - serp_now)
                                if url_score > url_dict[url]:
                                    url_dict[url] = url_score
                                domain_score = compute_reward(rel, i, 0.6, base_serp - serp_now)
                                if domain_score > domain_dict[domain]:
                                    domain_dict[domain] = domain_score
                for idx in range(len(base_url_list)):
                    base_score = 1 / math.log(idx + 2, 2)
                    if idx >= 5:
                        base_score = base_score / idx  # extra penalty below rank 5
                    d_reward = domain_dict[base_url_list[idx][2]]
                    u_reward = url_dict[base_url_list[idx][1]]
                    if d_reward > 0 and u_reward == 0:
                        reward = d_reward * 0.6
                    elif u_reward > 0:
                        reward = u_reward * 0.1
                    else:
                        reward = 0
                    temp_url_list.append([base_score + reward / 2, base_url_list[idx][1]])
                temp_url_list.sort(reverse=True)
                for url in temp_url_list:
                    url[0] = base_dict[url[1]]
                ndcg += NDCG.computeNDCG(temp_url_list)
                num += 1
            else:
                print('error: corresponding base or session not found')
                break
    print('Local Reward:', ndcg / num)
    return ndcg / num

# computeNDCG_LR(gl.test_sample_url_domain, gl.baseline_sample_url_domain)
# test_local_method_proportion(gl.test_clean_sample, gl.baseline_clean_sample)
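# compute_reward is not shown either. By analogy with the inline computation in
# sample() above, it plausibly combines a position-discounted gain with a
# geometric decay over the SERP distance; the exact form is an assumption:
import math

def compute_reward(rel, pos, gamma, serp_distance):
    return ((rel - 1) / math.log(pos + 2, 2)) * math.pow(gamma, serp_distance)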
id_lat_lng_table.append([])

for cnt1 in range(len(id_lat_lng_table)):
    for cnt2 in range(3):
        id_lat_lng_table[cnt1].append(dataReadIn1[cnt1][cnt2])
    id_lat_lng_table[cnt1].append(0)

for cnt in range(len(id_lat_lng_table)):
    id_lat_lng_table[cnt][1] = float(id_lat_lng_table[cnt][1])
    id_lat_lng_table[cnt][2] = float(id_lat_lng_table[cnt][2])

t3 = time.time()
print("Time for constructing table : " + str(t3 - t2) + "sec")

Checkin_id = []
Checkins_data_1 = NDCG.readfile("dataset_TIST2015_Checkins_part1.txt")
for cnt in range(len(Checkins_data_1)):
    Checkin_id.append(Checkins_data_1[cnt][1])
Checkins_data_1.clear()

Checkins_data_2 = NDCG.readfile("dataset_TIST2015_Checkins_part2.txt")
for cnt in range(len(Checkins_data_2)):
    Checkin_id.append(Checkins_data_2[cnt][1])
Checkins_data_2.clear()

Checkins_data_3 = NDCG.readfile("dataset_TIST2015_Checkins_part3.txt")
for cnt in range(len(Checkins_data_3)):
    Checkin_id.append(Checkins_data_3[cnt][1])
Checkins_data_3.clear()

Checkins_data_4 = NDCG.readfile("dataset_TIST2015_Checkins_part4.txt")
def eval():
    parsedQuery = None
    parsedText = None
    file = "data/cisi/cisi"
    for path in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(path + file)
            parsedText = myParser.buildDocCollectionSimple(path + file + ".txt", ".W")
            break
        except FileNotFoundError:
            pass

    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    # five vector-space models, one per weighting scheme
    models = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    models = [clas(indexer) for clas in models]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in models]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)
    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]
    print("fit")
    # jelinek.fit(np.linspace(0, 2, 2), data_fit, labels)
    # okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit, labels)

    # add a PageRank re-ranking variant of each base model
    for i in range(len(models)):
        models.append(pagerank.PagerankMarcheAlea(indexer, models[i]))

    k = 9
    metrics = [averagePrecision.AveragePrecision(),
               precisionAtK.PrecisionAtK(k),
               fMesureK.FMesureK(1, k),
               rappelAtK.RappelAtK(k),
               NDCG.NDCG(k),
               reciprocalRank.ReciprocalRank()]

    perf = []
    print(models)
    print(metrics)
    for i, model in enumerate(models):
        print(i, "/", len(models))
        perf.append([])
        pred = [model.getRanking(data_fit[j]) for j in range(len(data_fit))]
        for metric in metrics:
            score, std = metric.eval_list_query(pred, labels)
            perf[-1].append(score)
        print([round(x, 4) for x in perf[-1]])

    import matplotlib.pyplot as plt
    plt.imshow(perf)
    plt.colorbar()
    plt.xlabel("Metric")
    plt.ylabel("Model")
    plt.figtext(0, 0, "Metrics: 0: averagePrecision, 1: precisionAtK, 2: fMesureK, "
                      "3: rappelAtK, 4: NDCG, 5: reciprocalRank; "
                      "Models: 0-4: Vectoriel, 5: jelinekMercer, 6: okapiBM25, "
                      "7-13: with pagerank")
    plt.show()
def main():
    try:
        db = DBRepository()
        flag = True
        while flag:
            # prompt until the user enters a non-empty query
            while True:
                clear_screen()
                puts(colored.green('SEARCHING ENGINE')
                     + colored.clean('\n -by Three Legged Spider'))
                space(1)
                raw_user_query = input(colored.yellow("Enter your query: "))
                if len(raw_user_query) >= 1:
                    break
                print('Enter valid query')
                time.sleep(0.5)
            user_query = query_preprocessor(raw_user_query)
            common_docs, unique_docs = db.search_double_query(user_query)
            if common_docs == 0:
                space(1)
                print('No results found')
                space(2)
                exit_choice(flag)
            else:
                proximity_dict = {}
                if len(user_query) > 1:
                    proximity_dict = db.get_proximity(list(common_docs), user_query)
                    proximity_dict = proximity(list(common_docs), proximity_dict)
                    # sorted() returns a new list; capture it instead of
                    # discarding the result as the original code did
                    ranked_docs = sorted(proximity_dict,
                                         key=lambda x: proximity_dict[x]['final_score'],
                                         reverse=True)
                    common_docs = set(ranked_docs)
                final_document_list, final_url_list = final_list_creation(
                    common_docs, unique_docs, 5)
                # word positions, retained for snippet generation
                if len(user_query) == 1:
                    position_dict = db.position_for_one_word(
                        list(final_document_list), user_query[0])
                else:
                    unique_doc_positions1 = db.position_for_one_word(
                        list(final_document_list), user_query[0])
                    unique_doc_positions2 = db.position_for_one_word(
                        list(final_document_list), user_query[1])
                text_snippet = []
                for doc in list(final_document_list):
                    text_snippet.append(get_snippet(doc, user_query))
                print(colored.cyan('Google Results: '))
                output_string(google_search(user_query))
                space(1)
                print(colored.cyan("Search Results:"))
                output_string(final_url_list, text_snippet)
                space(1)
                ground_truth = [int(x) for x in input(
                    'List of relevance scores for each Google result, separated by spaces: ').split()]
                relevance_score = [int(x) for x in input(
                    'List of relevance scores for each result, separated by spaces: ').split()]
                N_list = NDCG.ndcg_at_k(relevance_score, 5)
                N_list_with_ground_truth = NDCG.ndcg_score(ground_truth,
                                                           relevance_score, k=5)
                space(1)
                print('NDCG without Ground Truth: ' + str(N_list))
                print('NDCG with Ground Truth: ' + str(N_list_with_ground_truth))
                space(1)
                exit_choice(flag)
    except (KeyboardInterrupt, SystemExit):
        print('Exit by User')
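# ndcg_at_k and ndcg_score come from the NDCG module and are not shown. A
# minimal sketch consistent with the calls above: ndcg_at_k normalises a list
# of relevance scores against its own ideal ordering, while ndcg_score uses the
# Google ground-truth scores as the normaliser (both forms are assumptions):
import numpy as np

def dcg_at_k(rels, k):
    # discounted cumulative gain of the first k relevance scores
    rels = np.asarray(rels, dtype=float)[:k]
    if rels.size == 0:
        return 0.0
    return float(np.sum(rels / np.log2(np.arange(2, rels.size + 2))))

def ndcg_at_k(rels, k):
    ideal = dcg_at_k(sorted(rels, reverse=True), k)
    return dcg_at_k(rels, k) / ideal if ideal > 0 else 0.0

def ndcg_score(ground_truth, predicted, k=5):
    ideal = dcg_at_k(sorted(ground_truth, reverse=True), k)
    return dcg_at_k(predicted, k) / ideal if ideal > 0 else 0.0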