def test_ranking(filename):
    vectors = load_vectors(filename)
    normalize(vectors)
    with codecs.open("data/rank.txt", "r", encoding="utf-8") as source:
        lines = [line.strip().split() for line in source.readlines()]
    actual = [line[1:] for line in lines]
    predicted = [[x[0] for x in nearest(vectors, line[0], normal=True)]
                 for line in lines]
    print ml_metrics.mapk(actual, predicted, k=100)
def test_mapk(self):
    self.assertAlmostEqual(metrics.mapk([range(1, 5)], [range(1, 5)], 3), 1.0)
    self.assertAlmostEqual(
        metrics.mapk([[1, 3, 4], [1, 2, 4], [1, 3]],
                     [range(1, 6), range(1, 6), range(1, 6)], 3),
        0.685185185185185)
    self.assertAlmostEqual(
        metrics.mapk([range(1, 6), range(1, 6)],
                     [[6, 4, 7, 1, 2], [1, 1, 1, 1, 1]], 5),
        0.26)
    self.assertAlmostEqual(
        metrics.mapk([[1, 3], [1, 2, 3], [1, 2, 3]],
                     [range(1, 6), [1, 1, 1], [1, 2, 1]], 3),
        11.0 / 18)
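# Worked check of the 0.26 expectation above (an arithmetic sketch added for
# illustration, not part of the original test suite):
#   user 1: actual = [1..5], predicted = [6, 4, 7, 1, 2]
#           hits at ranks 2, 4, 5 -> precisions 1/2, 2/4, 3/5 -> AP@5 = (0.5 + 0.5 + 0.6) / 5 = 0.32
#   user 2: actual = [1..5], predicted = [1, 1, 1, 1, 1]
#           repeated predictions are credited only once -> AP@5 = (1/1) / 5 = 0.20
#   MAP@5 = (0.32 + 0.20) / 2 = 0.26
import ml_metrics as metrics
assert abs(metrics.mapk([list(range(1, 6)), list(range(1, 6))],
                        [[6, 4, 7, 1, 2], [1, 1, 1, 1, 1]], 5) - 0.26) < 1e-9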
def test_ranking_2(filename):
    vectors = load_vectors(filename)
    normalize(vectors)
    with codecs.open("data/rank.txt", "r", encoding="utf-8") as source:
        lines = [line.strip().split() for line in source.readlines()]
    vocab = set([word for line in lines for word in line])
    vectors = {word: vector for word, vector in vectors.iteritems()
               if word in vocab}
    actual = [line[1:] for line in lines]
    predicted = [[x[0] for x in nearest(vectors, line[0], normal=True)]
                 for line in lines]
    print ml_metrics.mapk(actual, predicted, k=100)
def test_mapk(self):
    self.assertAlmostEqual(metrics.mapk([range(1, 5)], [range(1, 5)], 3), 1.0)
    print metrics.mapk(
        [[1, 3, 4], [1, 2, 4], [1, 3]],
        [range(1, 6), range(1, 6), range(1, 6)], 5)  # , 0.685185185185185)
    self.assertAlmostEqual(
        metrics.mapk([range(1, 6), range(1, 6)],
                     [[6, 4, 7, 1, 2], [1, 1, 1, 1, 1]], 5),
        0.26)
    self.assertAlmostEqual(
        metrics.mapk([[1, 3], [1, 2, 3], [1, 2, 3]],
                     [range(1, 6), [1, 1, 1], [1, 2, 1]], 3),
        11.0 / 18)
def load_paper_pair(path):
    paper_size = 60
    end = paper_size * len(candidate_ids)
    batch_x = np.load(path[0], allow_pickle=True)[:end]
    batch_ans = np.load(path[1], allow_pickle=True)[:end].tolist()
    batch_pred = make_prediction(batch_x, paper_size).tolist()
    print(metrics.mapk(batch_ans, batch_pred, 150))
def map_brain(dataset, n_rows, labels=[], save_name=None):
    map_k = 3
    print('There are %d rows' % n_rows)
    data_coor_sigma = dataset[['x', 'y', 'accuracy']].values
    closest_3places_ids = []
    closest_3places_ids_str = np.zeros((dataset.shape[0],)).astype(object)
    for i, cur_coor in enumerate(data_coor_sigma[:n_rows]):
        if not i % 1000:
            print('row %d' % i)
        metric_results = (((places_x - cur_coor[0]) / places_x_sd) ** 2 +
                          ((places_y - cur_coor[1]) / places_y_sd) ** 2) / places_freq
        ranked_ids = argsort_short(metric_results, map_k)
        cur_places = []
        for place_id in ranked_ids:
            cur_places.append(places_ID[place_id])
        closest_3places_ids.append(cur_places)
        closest_3places_ids_str[i] = ' '.join(map(lambda x: str(x), cur_places))  # For submission
    if len(labels):
        print('The MAP3 score is %f' % mapk(labels, closest_3places_ids, map_k))
    if save_name:
        submission = pd.DataFrame.from_csv('sample_submission.csv')
        submission['place_id'] = closest_3places_ids_str
        submission.to_csv(save_name)
def run_linear_regression(usage, thefts, theft_usage, day_num, apt_num, noise):
    daily_thefts = convert_theft_to_daily(thefts, apt_num, day_num)
    caught_total = 0
    caught_thefts_total = 0
    total_pvalues = []
    total_coeffs = []
    for day in xrange(day_num):
        caught, caught_theft, daily_pvalue, daily_coef = compute_precision_recall(
            daily_thefts[day], theft_usage[day], usage[day], noise=noise)
        caught_total += caught
        caught_thefts_total += caught_theft
        total_pvalues.extend(daily_pvalue)
        total_coeffs.extend(daily_coef)
    print "Summary"
    print "Theft {}. Caught {}. Correct {}".format(len(thefts), caught_total,
                                                   caught_thefts_total)
    print "map@k"
    indexes = [i for i, v in enumerate(total_pvalues) if v < 1]
    pred_thievesid = [i for i, v in enumerate(total_coeffs) if v > 0.1]
    sorted_houses = np.argsort(total_coeffs)
    refined_sorted_houses = [x for x in sorted_houses if x in set(indexes)]
    refined_sorted_houses = [x for x in refined_sorted_houses
                             if x in set(pred_thievesid)]
    for i in range(1, 11):
        mapk = metrics.mapk(
            convert_theft_to_daily(thefts, cfg.Apts, cfg.Lr_Days),
            convert_theft_to_daily(refined_sorted_houses, cfg.Apts, cfg.Lr_Days),
            i)
        print "{}: {}".format(i, mapk)
def comparing_with_ground_truth(tops, txt_infos, k):
    utils.dump_pickle("result.pkl", tops)
    gt = utils.get_pickle("datasets/qst1_w4/gt_corresps.pkl")
    hypo = utils.get_pickle("result.pkl")
    mapAtK = metrics.mapk(gt, hypo, k)
    print("\nMap@ " + str(k) + " is " + str(mapAtK))

    bbs_gt = np.asarray(
        utils.get_groundtruth("datasets/qst1_w4/text_boxes.pkl")).squeeze()
    bbs_predicted = [[painting.boundingxy for painting in txt_info]
                     for txt_info in txt_infos]
    mean_iou = utils.get_mean_IoU(bbs_gt, bbs_predicted)
    print("Mean Intersection over Union: ", mean_iou)

    texts_gt = utils.get_gt_text("datasets/qst1_w4")
    texts_predicted = [[painting.text for painting in txt_info]
                       for txt_info in txt_infos]
    with open('results.txt', 'w') as f:
        for item in texts_predicted:
            f.write("%s\n" % item)
    mean_lev = utils.compute_lev(texts_gt, texts_predicted)
    print(texts_predicted)
    print("\n")
    print(texts_gt)
    print("Mean Levenshtein distance: ", mean_lev)
def validate(model, dataset, make_batch_fn=make_batch):
    """Validates the model

    Args:
        model (nn.Module): The model to be validated
        dataset (DrawingLoader): The validation dataset
        make_batch_fn (function, optional): Defaults to make_batch. The function for making batches.

    Returns:
        float: The MAP@3 validation score
    """
    model.eval()
    actual = []
    predicted = []
    log("Calculating MAP@3 for the validation dataset...")
    for batch in tqdm(dataset, desc="Validating", file=sys.stdout):
        inputs, targets, _, _ = make_batch_fn(batch)
        predicted.append(model.predict(inputs))
        actual.append(targets.data.cpu().numpy())
    actual = np.concatenate(actual)
    actual = [[x] for x in actual]
    predicted = np.concatenate(predicted, axis=0)
    map_3 = metrics.mapk(actual, predicted, 3)
    log("Validation MAP@3: {}".format(map_3))
    log("--------------------------------------------------------")
    return map_3
def main():
    # K parameter for map@k
    k = 1

    # Get images and denoise query set.
    print("Reading images...")
    qs = get_imgs("datasets/qsd1_w4")
    db = get_imgs("datasets/DDBB")

    """
    Denoising methods
    "Gaussian"
    "Median"
    "bilateral"
    "FastNl"
    """
    print("Denoising images...")
    #qs_denoised = [utils.denoise_image(img, method="FastNl") for img in tqdm(qs)]

    # Separating paintings inside images into separate images
    qs_split = [background_remover.remove_background(img) for img in qs]

    print("\nComputing histograms...")
    hogs_qs = [[utils.get_hog_histogram(painting) for painting in img]
               for img in qs_split]
    hogs_ddbb = utils.get_hog_histograms(db)

    print("\nComputing distances")
    distances = []
    # Generating distances between qs images and db images
    for im in tqdm(hogs_qs):
        current_im = []
        for painting_hog in im:
            current_pt = []
            for db_hog in hogs_ddbb:
                current_pt.append(sum(np.abs(painting_hog - db_hog)))
            current_im.append(current_pt)
        distances.append(current_im)
    print("Done calculating hogs")

    # Generating predictions
    predictions = []
    for im in distances:
        current_im = []
        for painting_dst in im:
            current_im.append(utils.list_argsort(painting_dst)[:k])
        predictions.append(current_im)

    # Remove nesting of lists
    hypo = []
    for im in predictions:
        current_im = []
        for painting in im:
            for pred in painting:
                current_im.append(pred)
        hypo.append(current_im)

    # Generate map@k
    gt = utils.get_pickle("datasets/qsd1_w4/gt_corresps.pkl")
    mapAtK = metrics.mapk(gt, hypo, k)
    print("\nMap@ " + str(k) + " is " + str(mapAtK))
def test_data2x2(self):
    X, y = data2x2(100)
    estimator = KNeighborsClassifier(n_neighbors=1)
    grid = grid2d.Grid2d([0, 0.5, 1.0], [0, 0.5, 1.0], estimator)
    grid.fit(X, y)
    pred = grid.predict(X)
    self.assertTrue(np.array_equal(pred, y))
    # ml_metrics.mapk expects (actual, predicted, k)
    self.assertEqual(
        metrics.mapk(np.array(y).reshape(-1, 1), pred.reshape(-1, 1), 1), 1.0)
def metric_bar(hot, ansK, recommend, method='MAP', t='train',
               kList=(1, 5, 10, 25, 50, 100, 150)):
    for k in kList:
        if method == 'MAP':
            hot_metrics = metrics.mapk(ansK.tolist(), [hot] * ansK.shape[0], k)
            rs_metrics = metrics.mapk(ansK.tolist(), recommend.T.tolist(), k)
            plt.bar('Hot', hot_metrics)
            plt.bar('RS', rs_metrics)
            # if t == 'train':
            #     plt.bar('GR', metrics.mapk(ansK.tolist(), graph_recommend_papers.T.tolist(), k))
        if method == 'Recall':
            plt.bar('Hot', mark(ansK, [hot] * ansK.shape[0], k))
            plt.bar('RS', mark(ansK, recommend.T, k))
            # if t == 'train':
            #     plt.bar('GR', mark(ansK.values, graph_recommend_papers.T, k))
        plt.ylabel('score')
        plt.title(t + ' ' + method + '@' + str(k))
        plt.show()
def validate():
    train = pd.read_csv(os.path.join(HOME_DIR, "input/clicks_train.csv"))
    y = train[train.clicked == 1].ad_id.values
    y = [[_] for _ in y]
    predict = pd.read_csv(os.path.join(HOME_DIR, "output/sub_v2.csv"))
    # each submission row holds a space-separated ranking of ad ids;
    # build one flat integer list per display so it lines up with y
    p = [[int(ad) for ad in row["ad_id"].split()]
         for index, row in predict.iterrows()]
    print(mapk(y, p, k=12))
def run_map_test(data, eventNames, users=None,
                 primaryEvent=cfg.testing.primary_event,
                 consider_non_zero_scores=cfg.testing.consider_non_zero_scores_only,
                 num=200, K=cfg.testing.map_k, test=False,
                 predictionio_url="http://0.0.0.0:8000"):
    N_TEST = 2000
    d = {}
    res_data = {}
    engine_client = predictionio.EngineClient(url=predictionio_url)
    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if (users is None) or (user in users):
                d.setdefault(user, []).append(item)
    if test:
        holdoutUsers = d.keys()[1:N_TEST]
    else:
        holdoutUsers = d.keys()
    prediction = []
    ground_truth = []
    user_items_cnt = 0.0
    users_cnt = 0
    for user in tqdm(holdoutUsers):
        q = {
            "user": user,
            "eventNames": eventNames,
            "num": num,
        }
        try:
            res = engine_client.send_query(q)
            # Sort by score then by item name
            tuples = sorted([(r["score"], r["item"]) for r in res["itemScores"]],
                            reverse=True)
            scores = [score for score, item in tuples]
            items = [item for score, item in tuples]
            res_data[user] = {
                "items": items,
                "scores": scores,
            }
            # Consider only non-zero scores
            if consider_non_zero_scores:
                if len(scores) > 0 and scores[0] != 0.0:
                    prediction.append(items)
                    ground_truth.append(d.get(user, []))
                    user_items_cnt += len(d.get(user, []))
                    users_cnt += 1
            else:
                prediction.append(items)
                ground_truth.append(d.get(user, []))
                user_items_cnt += len(d.get(user, []))
                users_cnt += 1
        except predictionio.NotFoundError:
            print("Error with user: %s" % user)
    return ([metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)],
            res_data,
            user_items_cnt / (users_cnt + 0.00001))
def run_map_test(data, eventNames, users=None,
                 primaryEvent=cfg.testing.primary_event,
                 consider_non_zero_scores=cfg.testing.consider_non_zero_scores_only,
                 num=200, K=cfg.testing.map_k, test=False,
                 predictionio_url="http://0.0.0.0:8000"):
    N_TEST = 2000
    d = {}
    res_data = {}
    engine_client = predictionio.EngineClient(url=predictionio_url)
    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if not users or user in users:
                d.setdefault(user, []).append(item)
    if test:
        holdoutUsers = [*d.keys()][1:N_TEST]
    else:
        holdoutUsers = [*d.keys()]
    prediction = []
    ground_truth = []
    user_items_cnt = 0.0
    users_cnt = 0
    for user in tqdm(holdoutUsers):
        q = {
            "user": user,
            "eventNames": eventNames,
            "num": num,
        }
        try:
            res = engine_client.send_query(q)
            # Sort by score then by item name
            tuples = sorted([(r["score"], r["item"]) for r in res["itemScores"]],
                            reverse=True)
            scores = [score for score, item in tuples]
            items = [item for score, item in tuples]
            res_data[user] = {
                "items": items,
                "scores": scores,
            }
            # Consider only non-zero scores
            if consider_non_zero_scores:
                if len(scores) > 0 and scores[0] != 0.0:
                    prediction.append(items)
                    ground_truth.append(d.get(user, []))
                    user_items_cnt += len(d.get(user, []))
                    users_cnt += 1
            else:
                prediction.append(items)
                ground_truth.append(d.get(user, []))
                user_items_cnt += len(d.get(user, []))
                users_cnt += 1
        except predictionio.NotFoundError:
            print("Error with user: %s" % user)
    return ([metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)],
            res_data,
            user_items_cnt / (users_cnt + 0.00001))
def map_5_scorer(estimator, X, y):
    prob = estimator.predict_proba(X)

    def top5(row):
        return sorted(range(len(row)), key=lambda k: row[k], reverse=True)

    y = map(lambda x: [x], y)
    y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
    return mtr.mapk(y, y_pred, 5)
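# Minimal sketch of the top-k extraction pattern used by map_5_scorer above, with
# made-up probabilities; it assumes the true labels coincide with the column
# indices of predict_proba (the variant further below maps through
# estimator.classes_ instead).
import numpy as np
import ml_metrics as mtr

prob_demo = np.array([[0.1, 0.6, 0.3],
                      [0.7, 0.2, 0.1]])
# indices of each row sorted by descending probability
top_demo = np.apply_along_axis(lambda row: np.argsort(row)[::-1], axis=1, arr=prob_demo)
# true labels 1 and 2 -> AP@5 of 1 and 1/3, so MAP@5 = (1 + 1/3) / 2
print(mtr.mapk([[1], [2]], top_demo.tolist(), 5))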
def random_search(train, user_hist, correct: dict, items_to_predict,
                  num_samples: int = 20, num_threads: int = -1):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it on the
    test set.

    Parameters
    ----------
    train: np.float32 coo_matrix
        Training data.
    user_hist: dict
        Items each user has already interacted with (excluded from scoring).
    correct: dict
        Dict keyed by user with the held-out (correct) items.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.

    Returns
    ----------
    generator of (MAP@5 score, hyperparameter dict, fitted model, fit duration)
    """
    best_score = -1
    best_params = {}
    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        start = datetime.now()
        print('hyperparams set:', hyperparams)
        num_epochs = hyperparams.pop("num_epochs")
        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, num_threads=num_threads)

        recoms = {}
        num_to_recom = 5
        for user in correct.keys():
            # score only items the user has not interacted with yet
            items_to_score = list(items_to_predict.difference(user_hist[user]))
            predict = model.predict(user, items_to_score, num_threads=num_threads)
            top_recoms_id = sorted(range(len(predict)),
                                   key=lambda i: predict[i])[-num_to_recom:]
            top_recoms_id.reverse()
            recoms[user_decode[user]] = [
                item_decode[items_to_score[i]] for i in top_recoms_id
            ]
        # mapk expects (actual, predicted, k)
        score = metrics.mapk(list(correct.values()), list(recoms.values()), 5)
        print(score)
        hyperparams["num_epochs"] = num_epochs
        end = datetime.now()
        yield (score, hyperparams, model, end - start)
def old_df2mapk(df):
    df_clicked = df[df.clicked == '1'][['display_id', 'ad_id']]
    df_clicked_gb_display_id = df_clicked.groupby('display_id')['ad_id'].apply(list)
    df_result = (df.sort_values(['display_id', 'pred'], inplace=False, ascending=False)
                   .groupby('display_id')['ad_id']
                   .apply(list))
    sr_answer, sr_pred = df_clicked_gb_display_id.align(df_result, join='inner')
    return mapk(sr_answer.tolist(), sr_pred.tolist(), k=12)
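# Toy illustration (made-up display/ad ids) of the sort_values + groupby pattern
# above: ads are ranked per display_id by predicted score before computing MAP@12.
import pandas as pd

demo = pd.DataFrame({
    "display_id": [1, 1, 1, 2, 2],
    "ad_id":      [10, 11, 12, 20, 21],
    "pred":       [0.2, 0.9, 0.5, 0.3, 0.7],
})
ranked_demo = (demo.sort_values(["display_id", "pred"], ascending=False)
                   .groupby("display_id")["ad_id"]
                   .apply(list))
# ranked_demo[1] == [11, 12, 10], ranked_demo[2] == [21, 20]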
def eval_ranking(rel_test_topic, relevant_len, docs, model):
    # Precision, Recall
    precision, recall, docs_ids = precision_recall_rank(
        docs, rel_test_topic, relevant_len, model)
    relevant_retrieved_rank = [doc for doc in docs_ids if doc in rel_test_topic]

    # MAP
    map_results_rank = []
    print("\nMAP:")
    for k in range(1, len(docs_ids)):
        map_results_rank.append(ml_metrics.mapk(rel_test_topic, docs_ids, k))
        print(ml_metrics.mapk(rel_test_topic, docs_ids, k))

    # BPREF
    bPref = bpref(docs_ids, rel_test_topic, relevant_len, relevant_retrieved_rank)
    return precision, recall, map_results_rank, bPref
def run_svm(theft_usage, theft_vectors):
    scaler = StandardScaler()
    training = []
    training_label = []
    for day in xrange(cfg.Svm_Train_Days):
        for apt in xrange(len(theft_usage[day])):
            training.append(theft_usage[day][apt])
            index = day * cfg.Apts + apt
            if index in theft_vectors:
                training_label.append(1)
            else:
                training_label.append(0)
    training = np.asarray(training)
    training = scaler.fit_transform(training)
    training_label = np.asarray(training_label)

    testing = []
    for i in xrange(cfg.Svm_Train_Days, cfg.Days):
        for apt in theft_usage[i]:
            testing.append(apt)
    testing = np.asarray(testing)
    testing = scaler.fit_transform(testing)

    clf = SVC(class_weight={1: 5}, random_state=0, probability=True)
    clf.fit(training, training_label)
    print "done fit"

    # For probabilities
    pred_probability = clf.predict_proba(testing)[:, 1]
    sorted_probas = pred_probability.argsort()[::-1]  # descending order
    # For theft ids
    pred_results = clf.predict(testing)
    print "done predict"

    pre_number = cfg.Svm_Train_Days * cfg.Apts
    pred_theft = [(i + pre_number) for i in range(testing.shape[0])
                  if pred_results[i] == 1]
    total_theft = [i for i in theft_vectors if i >= pre_number]
    print "total theft, ", len(total_theft)
    print "detect, ", len(pred_theft)
    print "detect theft, ", len(set(pred_theft).intersection(total_theft))

    # compute map@k
    pred_day_theft = convert_theft_to_daily(
        [(i + pre_number) for i in sorted_probas if pred_results[i] == 1],
        cfg.Apts, cfg.Days)
    total_day_theft = convert_theft_to_daily(total_theft, cfg.Apts, cfg.Days)
    # only take the last 70 days (those with theft), as empty lists (days 1-349) would affect the average value
    for i in range(1, 11):
        mapk = metrics.mapk(total_day_theft[-70:], pred_day_theft[-70:], i)
        print "{}: {}".format(i, mapk)
def evaluate(y_true, y_pred):
    validate_predictions(y_true, y_pred)
    actual = []
    predicted = []
    for user_id in y_true.keys():
        actual.append(y_true[user_id])
        predicted.append(y_pred[user_id])
    return metrics.mapk(actual, predicted, k=100)
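# Hypothetical usage of evaluate above (assumes validate_predictions, defined
# elsewhere, only checks that both dicts cover the same users):
y_true_demo = {"u1": [5, 9], "u2": [3]}
y_pred_demo = {"u1": [5, 2, 9], "u2": [7, 3]}
# u1: hits at ranks 1 and 3 -> AP = (1 + 2/3) / 2; u2: hit at rank 2 -> AP = 1/2
# MAP@100 = (0.8333 + 0.5) / 2 = 0.6667
print(evaluate(y_true_demo, y_pred_demo))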
def main(k1, b, k3, feature, query_feature, r):
    file_list = open(model_dir + "/file-list")
    file = file_list.read().split("\n")
    file.remove("")
    num_doc = len(file)
    file_dic = dict()
    for i in range(num_doc):
        term = file[i].split("/")[-1].lower()
        file_dic[term] = i
    inf_file = open(model_dir + "/inverted-file")
    inf = inf_file.read()
    del inf_file
    inf = inf.split("\n")
    inf.remove("")
    dictionaries, que_dic, select_voc, num_term, que = build_term_dic(
        model_dir, query_dir, num, k3=k3, F=feature, QF=query_feature)
    D = build_doc_vector(num_doc, dictionaries, inf, select_voc, k1=k1, b=b)
    #pca = PCA(n_components=int(num_term*0.9), svd_solver='full')
    #D_ = pca.fit_transform(D)
    D_ = D
    if if_train:
        df = pd.read_csv("queries/ans_train.csv")
        truth = []
        for i in range(num):
            app = []
            ret = df["retrieved_docs"][i].split(" ")
            for ele in ret:
                app.append(file_dic[ele])
            truth.append(app)
        w_file = open("score.txt", "a")
        for i in r:
            train_rank = output(que, num, num_doc, D_, r=i)  #,feature=["title","concepts","question"])
            for j in range(num):
                print(ml_metrics.apk(truth[j], train_rank[j, :100].tolist(), k=100))
            score = ml_metrics.mapk(truth, train_rank[:, :100].tolist(), k=100)
            #w_file.write("s=%.3f,k1=%.1f, k3=%d, b=%.2f ,r=%d, f="%(score,k1,k3,b,i)+str(feature)+" q="+str(query_feature)+"\n")
            print(i, score)
        w_file.close()
    else:
        train_rank = output(que, num, num_doc, D_, r=r)[:, :100].tolist()
        return train_rank, file
def event_type_map_eval_ml_metrics(config, logger, result):
    print_and_log(logger, "====================================")
    print_and_log(logger, "event type MAP evaluation:")
    print_and_log(
        logger, "event type MAP: {}".format(
            ml_metrics.mapk(result['gt_all_event_id'],
                            result['pred_all_event_id'])))
    k_list = []
    for i in range(len(result['chain_name'])):
        k_list.append(len(result['gt_all_event_type'][i]))
    k_list = sorted(list(set(k_list)))
    k_list.remove(0)
    print_and_log(logger, "all possible k: {}".format(k_list))
    for k in k_list:
        map_at_k = ml_metrics.mapk(result['gt_all_event_id'],
                                   result['pred_all_event_id'], k)
        print_and_log(logger, "event type MAP@{}: {}".format(int(k), map_at_k))
    return result
def map_5_scorer(estimator, X, y):
    if X.shape[0] == 0:
        return 1
    prob = estimator.predict_proba(X)
    labels = np.array(estimator.classes_)

    def top5(prob):
        indice = sorted(range(len(prob)), key=lambda k: prob[k], reverse=True)
        return labels[indice].tolist()

    y = map(lambda x: [x], y)
    y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
    return mtr.mapk(y, y_pred, 5)
def run_map_test_dummy(data, items=None, probs=None, uniform=True, top=True,
                       users=None, primaryEvent=cfg.testing.primary_event,
                       K=10, no_progress=False):
    """Performs dummy test

    Args:
        data: list of event rows
        items: np.array or list of items sorted in descending popularity order
        probs: np.array or list of corresponding probabilities (needed for experiment #2)
        uniform: Boolean flag to use uniform sampling
        top: Boolean flag to use top items
        users: set of users to consider
        primaryEvent: str name of primary event
        K: int for MAP @ K
        no_progress: Boolean flag not to show the progress bar during calculations

    Returns:
        list of [MAP@1, MAP@2, ... MAP@K] evaluations
    """
    d = {}
    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if (users is None) or (user in users):
                d.setdefault(user, []).append(item)
    holdoutUsers = d.keys()
    prediction = []
    ground_truth = []
    if no_progress:
        gen = holdoutUsers
    else:
        gen = tqdm(holdoutUsers)
    for user in gen:
        if top:
            test_items = items[0:K]
        elif uniform:
            test_items = np.random.choice(items, size=(K,))
        else:
            test_items = np.random.choice(items, size=(K,), p=probs)
        prediction.append(test_items)
        ground_truth.append(d.get(user, []))
    return [metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)]
def train_test_partition(feature_list, labels_list):
    x = feature_list
    y = labels_list
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10)
    knn_cv = KNeighborsClassifier(n_neighbors=1)
    #svm_cf = svm.SVC(gamma=0.001)
    #cv_scores = cross_val_score(knn_cv, x, y, cv=2)
    #print(cv_scores)
    knn_cv.fit(X_train, y_train)
    expected = y_test
    predicted = knn_cv.predict(X_test)
    # mapk expects a list of relevant items per sample, so wrap each label in a list
    print(ml_metrics.mapk([[e] for e in expected], [[p] for p in predicted], 4))
def cal_map(pred_valid, cv, train_df, tr_data):
    df_pred = train_df[train_df['cv'] == cv].copy()
    df_pred['pred'] = pred_valid
    df_pred = df_pred[['description_id', 'paper_id', 'pred']]
    sort_df_pred = df_pred.sort_values(['description_id', 'pred'], ascending=False)
    df_pred = (df_pred[['description_id']].drop_duplicates()
               .merge(sort_df_pred, on=['description_id'], how='left'))
    df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
    df_pred = df_pred[df_pred['rank'] < 3]
    df_pred = (df_pred.groupby(['description_id'])['paper_id']
               .apply(lambda s: ','.join(s)).reset_index())
    df_pred = df_pred.merge(tr_data, on=['description_id'], how='left')
    df_pred.rename(columns={'paper_id': 'paper_ids'}, inplace=True)
    df_pred['paper_ids'] = df_pred['paper_ids'].apply(lambda s: s.split(','))
    df_pred['target_id'] = df_pred['target_id'].apply(lambda s: [s])
    return metrics.mapk(df_pred['target_id'].tolist(),
                        df_pred['paper_ids'].tolist(), 3)
def show_results(query_path: str, method_names: List[str], matching_results, text_results):
    # if 'w5' in query_path:
    with open('./w5_query_devel.pkl', 'rb') as file:
        matching_dict = pickle.load(file)
    with open('./w5_text_bbox_list.pkl', 'rb') as file:
        text_dict = pickle.load(file)

    texts_sol = (seq(text_dict)
                 .map(lambda p: Rectangle(p[0:2], (p[2] - p[0]) + 1, (p[3] - p[1]) + 1))
                 .to_list())

    table = []
    for pos, method_name in enumerate(method_names):
        # Matching results
        matching = (
            seq(matching_results[pos])
            .map(lambda r: r[1])
            .map(lambda r: seq(r).map(lambda s: s.id).to_list())
            .map(replace_empty)
            .to_list()
        )
        matching_solution = (seq(matching_results[pos])
                             .map(lambda r: matching_dict[r[0].id][1])
                             .to_list())

        # Text results
        text_iou = (seq(texts_sol)
                    .zip(text_results[pos])
                    .map(lambda pair: pair[0].ioi(pair[1]))
                    .average())

        table.append((method_name,
                      metrics.mapk(matching_solution, matching, k=10),
                      metrics.mapk(matching_solution, matching, k=5),
                      metrics.mapk(matching_solution, matching, k=1),
                      text_iou))

    data = pandas.DataFrame(table, columns=['Method', 'MAPK K=10', 'MAPK K=5',
                                            'MAPK K=1', 'Text IoU'])
    print(data)
def ensemble_score(dfs, model_output_paths, df_valid, **cs):
    c = [cs["c%d" % m] for m in range(len(cs))]
    probas = defaultdict(lambda: defaultdict(float))
    for m, df_model in enumerate(dfs):
        #logging.info("scoring %d, %s" % (m, model_output_paths[m]))
        for i in range(len(df_model)):
            probas[df_model["row_id"][i]][df_model["place_id"][i]] += c[m] * df_model["proba"][i]
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(
        lambda x: map(itemgetter(0),
                      sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    df_merge = pd.merge(df, df_valid, how="left", on="row_id")
    valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                               df_merge.place_id.values, 3)
    return valid_score
def parseDict(probas, output_name, valid_file=None):
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(
        lambda x: map(itemgetter(0),
                      sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    if valid_file is not None:
        df_valid = pd.read_csv(valid_file, usecols=["row_id", "place_id"])
        df_valid.rename(columns={"place_id": "place_id_label"}, inplace=True)
        df_merge = pd.merge(df, df_valid, how="left", on="row_id")
        valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                                   df_merge.place_id.values, 3)
        logging.info("total validation score: %f" % valid_score)
        del df_valid
        del df_merge
        return valid_score
def main():
    data_provider = DataProvider(data_directory=Path('./data'))
    item_users = transform_to_item_user_csr_matrix(
        data_provider.get_purchases_train())

    # baseline model
    model = get_model()
    np.random.seed(42)
    model.fit(item_users=item_users)

    test_user_ids, test_purchases = get_purchases_by_customer(
        data_provider.get_purchases_test())
    recommendations = get_recommendations(model, test_user_ids, item_users)
    score = mapk(test_purchases, recommendations, k=10)
    return score
def cal_map_at_k(self):
    """Map @ top_k"""
    full, top_k = self._subjects, self._top_k
    users = list(dict.fromkeys(list(full['user'])))
    actual = [
        list(full[(full['user'] == user) & (full['rank_true'] <= top_k)]['test_item'])
        for user in users
    ]
    predicted = [
        list(full[(full['user'] == user) & (full['rank'] <= top_k)]['test_item'])
        for user in users
    ]
    return mapk(actual, predicted, k=top_k)
def test(model, data, target):
    '''
    Predicts MAP score
    input : (model, data, target)
    output : MAP score
    '''
    (x_test, y_test) = data
    # predicting
    x_decode = model.predict(x_test)
    # Find the top 5 hotel cluster predictions
    tmp = x_decode[:, 673:x_decode.shape[1]]
    predictions = [tmp[i].argsort()[-5:][::-1] for i in range(tmp.shape[0])]
    # Calculate the MAP score
    score = metrics.mapk(target, predictions, k=5)
    return score
def mapk_score(s_hidden, recs_pred, k=10):
    """
    Computes the mean average precision at k (MAP@K) of recommendations.
    MAP@K = mean AP@K score over all users
    AP@K = (1 / min(m, k)) * sum from 1 to k of (precision at i * relevance of ith item)
    Where m is the number of items in a user's hidden set
    Where k is the number of items recommended to each user
    params:
        s_hidden: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list ordered by rank
        k: number of recommendations to use in top set
    returns: float, range [0, 1]
    """
    check_list_of_sets(s_hidden, "s_hidden")
    return ml_metrics.mapk(s_hidden, recs_pred, k)
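# Toy check of the docstring formula above (not part of the original module):
# hidden set {1, 2}, recommendations [1, 3, 2], k = 3
#   AP@3 = (1 / min(2, 3)) * (1/1 + 2/3) = 0.8333...
print(mapk_score([{1, 2}], [[1, 3, 2]], k=3))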
def ensemble_score(dfs, model_output_paths, df_valid, **cs):
    c = [cs["c%d" % m] for m in range(len(cs))]
    probas = defaultdict(lambda: defaultdict(float))
    for m, df_model in enumerate(dfs):
        #logging.info("scoring %d, %s" % (m, model_output_paths[m]))
        for i in range(len(df_model)):
            probas[df_model["row_id"][i]][df_model["place_id"][i]] += c[m] * df_model["proba"][i]
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(
        lambda x: map(itemgetter(0),
                      sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    df_merge = pd.merge(df, df_valid, how="left", on="row_id")
    valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                               df_merge.place_id.values, 3)
    return valid_score
def run_map_test_dummy(data, items=None, probs=None, uniform=True, top=True,
                       users=None, primaryEvent=cfg.testing.primary_event,
                       K=10, no_progress=False):
    """Performs dummy test

    Args:
        data: list of event rows
        items: np.array or list of items sorted in descending popularity order
        probs: np.array or list of corresponding probabilities (needed for experiment #2)
        uniform: Boolean flag to use uniform sampling
        top: Boolean flag to use top items
        users: set of users to consider
        primaryEvent: str name of primary event
        K: int for MAP @ K
        no_progress: Boolean flag not to show the progress bar during calculations

    Returns:
        list of [MAP@1, MAP@2, ... MAP@K] evaluations
    """
    d = {}
    for rec in data:
        if rec.event == primaryEvent:
            user = rec.entityId
            item = rec.targetEntityId
            if not users or user in users:
                d.setdefault(user, []).append(item)
    holdoutUsers = [*d.keys()]
    prediction = []
    ground_truth = []
    if no_progress:
        gen = holdoutUsers
    else:
        gen = tqdm(holdoutUsers)
    for user in gen:
        if top:
            test_items = items[0:K]
        elif uniform:
            test_items = np.random.choice(items, size=(K,))
        else:
            test_items = np.random.choice(items, size=(K,), p=probs)
        prediction.append(test_items)
        ground_truth.append(d.get(user, []))
    return [metrics.mapk(ground_truth, prediction, k) for k in range(1, K + 1)]
def main():
    step = 0.25
    map_k = 3
    save_name = 'sub_30nn.csv'
    split_val_time = 73000
    train_inc_labels, test = readfiles()
    x_arange, y_arange = create_grid((0, 10), (0, 10), step)

    np.random.seed(2016)
    # train evaluating
    knn_result_list = []
    label_list = []
    for x_cell_min in x_arange:
        for y_cell_min in y_arange:
            print('Working on %f, %f cell' % (x_cell_min + step / 2, y_cell_min + step / 2))
            cur_train, cur_test, cur_labels = find_index_in_cell(
                train_inc_labels, test, x_cell_min, y_cell_min, step)
            print('Train size is %d, test size is %d' % (cur_train.shape[0], cur_test.shape[0]))
            for i, probe in enumerate(cur_train.values):
                knn_result_list.append(list(functions_py.knn(
                    probe, cur_train.values, cur_labels.values,
                    self_test=True, mapk=map_k, k_nn=30)))
                label_list.append([cur_labels.values[i]])
            print('The MAP3 score is %f' % mapk(label_list, knn_result_list, map_k))
            print('***')

    np.random.seed(2016)
    # test predicting
    knn_ids_str = np.full((test.shape[0],), fill_value='0 1 2', dtype=object)
    for x_cell_min in x_arange:
        for y_cell_min in y_arange:
            print('Working on %f, %f cell' % (x_cell_min + step / 2, y_cell_min + step / 2))
            cur_train, cur_test, cur_labels = find_index_in_cell(
                train_inc_labels, test, x_cell_min, y_cell_min, step)
            print('Train size is %d, test size is %d' % (cur_train.shape[0], cur_test.shape[0]))
            test_index = cur_test.index.values
            for i, probe in enumerate(cur_test.values):
                knn_ids_str[test_index[i]] = ' '.join(list(functions_py.knn(
                    probe, cur_train.values, cur_labels.values,
                    self_test=False, mapk=map_k, k_nn=20).astype(str)))
                # print(test_index[i], knn_ids_str[test_index[i]])

    submission = pd.DataFrame.from_csv('sample_submission.csv')
    submission['place_id'] = knn_ids_str
    submission.to_csv(save_name)
""" MLing, CV """ print('CV') X_train, X_test, y_train, y_test = train_test_split(train_samp.values, target.values, test_size=0.33, random_state=42) classifier.fit(X_train, y_train) train_predict_prob = np.zeros((X_test.shape[0], n_classes)) for batch_i in np.arange(0, X_test.shape[0], test_batch): if (batch_i + test_batch) < X_test.shape[0]: train_predict_prob[batch_i: batch_i + test_batch, :] = \ classifier.predict_proba(X_test[batch_i: batch_i + test_batch, :]) else: train_predict_prob[batch_i:, :] = classifier.predict_proba(X_test[batch_i:, :]) train_predict_map = percent2mapk(train_predict_prob, 5) y_test_list = y2list(y_test) print('The mean average precision is %.4f' % mapk(y_test_list, train_predict_map, k=5)) train_predict_str = list2str(train_predict_map, ' ') """ MLing """ print('Batch predicting test') classifier.fit(train_samp.values, target.values) # Freeing memory del train_samp, target, X_train, X_test, y_train, y_test, train_predict_prob, train_predict_map if merge: test = pd.merge(test, destinations, left_on=test.srch_destination_id.values.astype(int), right_on=destinations.index.values, how='left') test = test.fillna(-10)
# NB w sample_weight
clf = MultinomialNB(alpha=0.07)
clf.fit(Xtrain, Ytrain)  # sample_weight = 0.1 + 0.5*train.is_booking)
pred = clf.predict_proba(Xtest)
pred_rank = np.apply_along_axis(lambda x: np.argsort(-x)[:5], 1, pred)
print pred_rank.shape
# pred_rank_prob = np.apply_along_axis(lambda x: x[np.argsort(-x)[:4]], 1, pred)

# compute_map
if Ytest.shape[0] == pred_rank.shape[0]:
    map_pred = metrics.mapk([[l] for l in Ytest], pred_rank, k=5)
    print map_pred

# pred = clf.predict(Xtest)
# acc = sum(pred==Ytest)/len(Ytest)
# print acc

# write output
import pickle
with open('test_id_lkp.pkl') as f:
    test_id_lkp = pickle.load(f)
print len(test_id_lkp)
with open('featurized/Xtest_train_test_users_click_10.pkl', 'w') as f:
    pickle.dump(Xtest, f)
def f5(test, predictions):
    target = [[l] for l in test['hotel_cluster']]
    return metrics.mapk(target, predictions, k=5)
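# Hypothetical check of f5 above, assuming `test` carries the true hotel_cluster per row:
import pandas as pd

test_demo = pd.DataFrame({"hotel_cluster": [91, 41, 48]})
preds_demo = [[91, 0, 31, 77, 2], [5, 41, 91, 48, 64], [48, 91, 42, 59, 28]]
# per-row AP@5 = 1, 1/2, 1 -> MAP@5 = 2.5 / 3 = 0.8333...
print(f5(test_demo, preds_demo))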
def main():
    # open files
    print "opening files"
    #destinations = pd.read_csv("data/destinations.csv")
    train = pd.read_csv("../data/train.csv",
                        usecols=['srch_destination_id', 'hotel_market', 'hotel_cluster',
                                 'user_location_country', 'user_location_region',
                                 'user_location_city', 'hotel_market',
                                 'orig_destination_distance', 'date_time', 'user_id',
                                 'is_booking'],
                        dtype={'srch_destination_id': np.uint32, 'hotel_market': np.uint32,
                               'hotel_cluster': np.uint32, 'user_location_country': np.uint32,
                               'user_location_region': np.uint32, 'is_booking': np.bool})
    test = pd.read_csv("../data/test.csv",
                       usecols=['srch_destination_id', 'hotel_market', 'user_location_country',
                                'user_location_region', 'user_location_city', 'hotel_market',
                                'orig_destination_distance', 'date_time', 'user_id'],
                       dtype={'srch_destination_id': np.uint32, 'hotel_market': np.uint32,
                              'user_location_country': np.uint32, 'user_location_region': np.uint32})
    print "files opened"

    train.date_time = pd.to_datetime(train.date_time)
    train["year"] = train.date_time.dt.year
    train["month"] = train.date_time.dt.month

    # select all users
    unique_user = train.user_id.unique()
    print "unique users", len(unique_user)
    #sel_user_ids = [unique_user[i] for i in sorted(random.sample(range(len(unique_user)),50000))]
    #sel_train = train[train.user_id.isin(sel_user_ids)]

    # just train on a subset of the data (seasonality); sel_train is commented out above,
    # so filter on the full train frame
    t1 = train[((train.year == 2013) | ((train.year == 2014) & (train.month < 8)))]
    #t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]
    #t2 = t2[t2.is_booking==True]
    t2 = test  # note: test.csv has no hotel_cluster column, so the MAP@5 checks below need a held-out split
    print "shape of t1 (train)", t1.shape
    print "shape of t2 (test)", t2.shape

    most_common_cl = list(train.hotel_cluster.value_counts().head().index)
    print "most common cluster prediction made", most_common_cl

    # clusters by destination id and type
    #match_cols = ["srch_destination_id", "srch_destination_type_id", "is_package", "hotel_market"]
    match_cols = ["srch_destination_id", "hotel_market"]
    cluster_cols = match_cols + ['hotel_cluster']
    groups = t1.groupby(cluster_cols)
    top_clusters = {}
    for name, group in groups:
        clicks = len(group.is_booking[group.is_booking == False])
        bookings = len(group.is_booking[group.is_booking == True])
        score = bookings + .15 * clicks
        clus_name = make_key(name[:len(match_cols)])
        if clus_name not in top_clusters:
            top_clusters[clus_name] = {}
        top_clusters[clus_name][name[-1]] = score

    cluster_dict = {}
    for n in top_clusters:
        tc = top_clusters[n]
        top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
        cluster_dict[n] = top

    preds = []
    for index, row in t2.iterrows():
        key = make_key([row[m] for m in match_cols])
        if key in cluster_dict:
            preds.append(cluster_dict[key])
        else:
            preds.append([])
    print "basic prediction made"

    match_cols = ['user_location_city', 'orig_destination_distance']
    groups = t1.groupby(match_cols)
    print "exact groups number: ", len(groups)
    exact_matches = []
    for i in range(t2.shape[0]):
        exact_matches.append(generate_exact_matches(t2.iloc[i], match_cols, groups))
        if i % 1000 == 0:
            print "read ", i, group
    print "exact matches prediction made"

    basic_preds = [f5(most_common_cl)[:5] for p in range(len(preds))]
    print "basic", metrics.mapk([[l] for l in t2["hotel_cluster"]], basic_preds, k=5)
    region_preds = [f5(preds[p] + most_common_cl)[:5] for p in range(len(preds))]
    print "regional", metrics.mapk([[l] for l in t2["hotel_cluster"]], region_preds, k=5)
    full_preds = [f5(exact_matches[p] + preds[p] + most_common_cl)[:5] for p in range(len(preds))]
    print "full", metrics.mapk([[l] for l in t2["hotel_cluster"]], full_preds, k=5)
# uncomment to write the file
write_p = [" ".join([str(l) for l in p]) for p in preds]
write_frame = ["{0},{1}".format(t2["id"][i], write_p[i]) for i in range(len(preds))]
write_frame = ["id,hotel_cluster"] + write_frame
with open("predictions_v2.csv", "w+") as f:
    f.write("\n".join(write_frame))
sys.exit()

### Code to get the mapk value ###
print "Getting Eval Metric"
import pandas as pd
import numpy as np
from ml_metrics import mapk

preds_df = pd.read_csv("val_leak_preds.csv")
preds = np.array(preds_df["hotel_cluster"].apply(lambda x: str(x).split(" ")))
#preds = [pred for pred in preds]
print preds[:10]

found_count = 0
total_count = 0
item_count = 0
for pred in preds:
    if pred != ['nan']:
        found_count += 1
        item_count += len(pred)
    total_count += 1
print "Item, Found and total : ", item_count, found_count, total_count

actuals = np.array(pd.read_csv("../../Data/val.csv", usecols=["hotel_cluster"])).astype('str')
actuals = actuals.reshape(len(actuals), 1)
#actuals = [list(actual) for actual in actuals]
print actuals[:10]
print mapk(actuals, preds, k=5)
images = []
keys = []
for s in samples:
    images.append(prepare_photo(load_im(s)))
    keys.append(s['photo'])
input_images = np.array(images).astype('float32')
vectors = func(input_images)
for i in range(len(vectors)):
    feature_vectors[keys[i]] = vectors[i]

true_images = list()
top20_images = list()
for item in tqdm(filtered_test_ids):
    streets = test_ids[item][0]
    shops = test_ids[item][1]
    for test_case in streets:
        test_im = prepare_photo(load_im(test_case))
        feature_vec = func(test_im.reshape((1, 3, 224, 224)).astype('float32'))
        results = [(i[0], cosine(feature_vec, i[1])) for i in feature_vectors.items()]
        results.sort(key=lambda x: x[1])
        true_images.append([x['photo'] for x in shops])
        top20_images.append([x[0] for x in results[:20]])

from ml_metrics import mapk
print mapk(true_images, top20_images, k=20)
print mapk(true_images, top20_images, k=10)
print mapk(true_images, top20_images, k=5)
def process_one_cell(df_train, df_test, valid_mode_on, gx_id, gy_id,
                     x_border, y_border, th, model_list):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    # filtering occurrence smaller than th
    # consider border of cell
    df_cell_train = df_train.loc[(df_train.grid_cell_x == gx_id) & (df_train.grid_cell_y == gy_id)]
    x_min = df_cell_train.x.min()
    x_max = df_cell_train.x.max()
    y_min = df_cell_train.y.min()
    y_max = df_cell_train.y.max()
    df_cell_train = df_train.loc[(df_train.x >= x_min - x_border) & (df_train.x <= x_max + x_border) &
                                 (df_train.y >= y_min - y_border) & (df_train.y <= y_max + y_border)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[(df_test.grid_cell_x == gx_id) & (df_test.grid_cell_y == gy_id)]
    row_ids = df_cell_test.row_id.values

    # Preparing data
    # remove columns and encode label
    le = LabelEncoder()
    y_train = le.fit_transform(df_cell_train.place_id.values)
    l_train = df_cell_train.shape[0]
    l_test = df_cell_test.shape[0]
    n_class = len(le.classes_)
    logging.info("number of class: %d" % n_class)
    if valid_mode_on:
        logging.info("validation mode")
        logging.info("train size: %d, validation size: %d" % (l_train, l_test))
        logging.info("%d labels in validation is not in train" %
                     len(set(df_cell_test.place_id.values) - set(df_cell_train.place_id.values)))
    else:
        logging.info("prediction mode")
        logging.info("train size: %d, test size: %d" % (l_train, l_test))
    df_cell_train_feats = df_cell_train.drop(['place_id', 'grid_cell_x', 'grid_cell_y', 'row_id'], axis=1)
    feats = df_cell_train_feats.columns.values
    df_cell_test_feats = df_cell_test[feats]

    y_test_pred = np.zeros((df_cell_test_feats.shape[0], n_class))
    for clf in model_list:
        y_test_pred_model = clf(df_cell_train_feats, y_train, df_cell_test_feats)
        y_test_pred += y_test_pred_model

    if valid_mode_on:
        pred_labels = le.inverse_transform(np.argsort(y_test_pred, axis=1)[:, ::-1][:, :10])
        valid_score = metrics.mapk(df_cell_test.place_id.values[:, None], pred_labels, 3)
        logging.info("valid score = %6.6f" % valid_score)
    else:
        valid_score = None

    # return list of (row_id, place_id, proba)
    top10_label = le.inverse_transform(np.argsort(y_test_pred, axis=1)[:, ::-1][:, :10])
    top10_proba_raw = np.sort(y_test_pred, axis=1)[:, ::-1][:, :10]
    top10_proba = top10_proba_raw / np.sum(top10_proba_raw, axis=1)[:, None]
    probas = []
    for i, rid in enumerate(row_ids):
        if i == 0:
            probas = np.array([[rid] * 10, top10_label[i], top10_proba[i]]).T
        else:
            probas = np.vstack([probas, np.array([[rid] * 10, top10_label[i], top10_proba[i]]).T])
    return probas, valid_score, l_test
print (valid.shape, train.shape)
cnt = train[train.clicked == 1].ad_id.value_counts()
cntall = train.ad_id.value_counts()
del train


def get_prob(k):
    if k not in cnt:
        return 0
    return cnt[k] / (float(cntall[k]) + reg)


def srt(x):
    ad_ids = map(int, x.split())
    ad_ids = sorted(ad_ids, key=get_prob, reverse=True)
    return " ".join(map(str, ad_ids))


if eval:
    from ml_metrics import mapk
    y = valid[valid.clicked == 1].ad_id.values
    y = [[_] for _ in y]
    p = valid.groupby('display_id').ad_id.apply(list)
    p = [sorted(x, key=get_prob, reverse=True) for x in p]
    print (mapk(y, p, k=12))
else:
    subm = pd.read_csv("./data/sample_submission.csv")
    subm['ad_id'] = subm.ad_id.apply(lambda x: srt(x))
    subm.to_csv("subm_reg_1.csv", index=False)
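# Illustration of the regularized click-through rate used by get_prob above,
# with made-up counts; the +reg term keeps rarely shown ads from outranking
# well-tested ones on raw click rate.
reg_demo = 10
clicks_demo = {"A": 30, "B": 3}
impressions_demo = {"A": 100, "B": 5}
ctr_demo = {k: clicks_demo[k] / float(impressions_demo[k] + reg_demo) for k in clicks_demo}
# A -> 0.27, B -> 0.20: ad B wins on raw CTR (0.6 vs 0.3) but loses after regularization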
""" import csv from ml_metrics import mapk import predict import settings def __read_rows(file_name): with open(file_name, "r", encoding="utf8") as file: return {int(r["AuthorId"]): [int(x) for x in r["PaperIds"].split()] for r in csv.DictReader(file)} if __name__ == "__main__": try: submission = __read_rows(settings.SUBMISSION_PATH) except FileNotFoundError: predict.submit_prediction() submission = __read_rows(settings.SUBMISSION_PATH) print("*** Calculate Mean Average Precision ***") print("\tbuilding valid solution") valid_solution = __read_rows(settings.VALID_SOLUTION_PATH) if sorted(valid_solution.keys()) != sorted(submission.keys()): print("The submission is incorrect: author ids are mismatched with the valid dataset") else: print("\tcalculating score") score = mapk([valid_solution[k] for k in valid_solution.keys()], [submission[k] for k in valid_solution.keys()]) print("\n*** Mean average precision for solution file: {0} ***".format(score))
print("Elapsed time column: %s minutes" % ((time.time() - start_time_column)/60)) def model(x_ranges, y_ranges, x_end, y_end, train, test): start_time = time.time() jobs = [] mgr = Manager() preds_total = mgr.dict(); for x_min, x_max in x_ranges: p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges, \ x_end, y_end, train, test, preds_total)) jobs.append(p) p.start() if len(jobs) == 1: for proc in jobs: proc.join(); jobs = []; print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60)) preds_total = pd.concat(preds_total.values(), axis=0); print preds_total.shape return preds_total.sort_index(); predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1)); actual = test_cv[['place_id']].sort_index(); print actual.shape print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)
if classifier == RF or classifier == GBM:
    print "Feature importance", classifier.feature_importances_

print("Making predictions")
predictions = classifier.predict_proba(test_features)[:, 1]
predictions = list(predictions)

author_predictions = defaultdict(list)
paper_predictions = {}
for (a_id, p_id), pred in zip(author_paper_ids, predictions):
    author_predictions[str(a_id)].append((pred, str(p_id)))

for author_id in sorted(author_predictions):
    paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
    paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

predicted = paper_predictions.items()
predicted.sort()
print [x[0] for x in predicted[:5]]
print [x[0] for x in test_ground_truth[:5]]

mp.append(metrics.mapk([row[1] for row in test_ground_truth],
                       [row[1] for row in predicted], 10000))
print mp[k]

print numpy.mean(mp)
print numpy.std(mp)


if __name__ == "__main__":
    main()
def main(): print("Getting features for deleted papers from the database") #features_deleted = data_io.get_features_db("TrainDeleted") features_deleted = data_io.get_precomputed_features("DeletedFeatures") print("Getting features for confirmed papers from the database") #features_conf = data_io.get_features_db("TrainConfirmed") features_conf = data_io.get_precomputed_features("ConfirmedFeatures") print("Getting features for deleted papers from the database") #valid_features_deleted = data_io.get_features_db("ValidDeleted") valid_features_deleted = data_io.get_precomputed_features("ValidDeletedFeatures") print("Getting features for confirmed papers from the database") #valid_features_conf = data_io.get_features_db("ValidConfirmed") valid_features_conf = data_io.get_precomputed_features("ValidConfirmedFeatures") features = [x[2:] for x in features_deleted + features_conf] #+ valid_features_deleted + valid_features_conf target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] #+ [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))] print("Training the Classifier") RF = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, compute_importances=True, random_state=1) GBM = GradientBoostingClassifier(n_estimators=100, verbose=2, min_samples_split=10, random_state=1) classifier = RF classifier.fit(features, target) # Validation author_paper_ids = [x[:2] for x in valid_features_conf+valid_features_deleted] features = [x[2:] for x in valid_features_conf+valid_features_deleted] print("Making predictions") predictions = classifier.predict_proba(features)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[str(a_id)].append((pred,str(p_id))) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] predicted = paper_predictions.items() predicted.sort() #Now I have sorted predictions for each author_id #Need to get the ground truth for the validation set: valid_confirmed_data = [row for row in csv.reader(open("ValidSolution.csv"))] #TrainConfirmed.csv valid_confirmed_papers = [(row[0],row[1].split()) for row in valid_confirmed_data[1:]] valid_confirmed_papers.sort() print predicted[0] print valid_confirmed_papers[0] import ml_metrics as metrics print metrics.mapk([row[1] for row in valid_confirmed_papers], [row[1] for row in predicted],10000)
for index, row in te.iterrows():
    srch_destination_id = row["srch_destination_id"]
    is_booking = row["is_booking"]
    hotel_market = row["hotel_market"]
    hotel_country = row["hotel_country"]
    top_clusters = nlargest(5, best_srch_destination_id[srch_destination_id],
                            key=best_srch_destination_id[srch_destination_id].get)
    # pad with market, country, and global fallbacks until 5 clusters are collected
    if len(top_clusters) <= 5:
        item = nlargest(5, best_hotel_market[hotel_market],
                        key=best_hotel_market[hotel_market].get)
        for i in item:
            if i not in top_clusters:
                top_clusters.append(i)
    if len(top_clusters) < 5:
        item = nlargest(5, best_hotel_country[hotel_country],
                        key=best_hotel_country[hotel_country].get)
        for i in item:
            if i not in top_clusters:
                top_clusters.append(i)
    if len(top_clusters) < 5:
        item = nlargest(5, best_hotel, key=best_hotel.get)
        for i in item:
            if i not in top_clusters:
                top_clusters.append(i)
    prediction.append(top_clusters[:5])

ground_truth = [[l] for l in te["hotel_cluster"]]
print "the rule-based method has MAP5 %s" % mapk(ground_truth, prediction, k=5)
# 0.25024279168333224