def prep_query_log():
    """Scan the query log once and write the derived lookup/statistics files.

    Reads the log at ``args.query_log`` through ``load_query_log`` (format
    ``'bing'``) and writes four files into ``args.data_dir``:

    * ``query_all``      -- (query id, query text) pairs
    * ``query_freq``     -- (query text, count) pairs, most frequent first
    * ``judgement_DCTR`` -- per (query id, doc id): share of log records
      whose ``dtime`` is non-zero
    * ``docid_to_url``   -- (doc id, url) pairs

    Query and document ids are assigned in order of first appearance.
    """
    log_path = args.query_log
    out_dir = args.data_dir

    query_dict = {}
    doc_dict = {}
    query_count = defaultdict(int)
    # Each cell is [records with dtime != 0, total records].
    qd_ctr = defaultdict(lambda: defaultdict(lambda: [0, 0]))
    previous_key = '#'

    records = load_query_log(log_path, format='bing', iterable=True)
    for seen, record in enumerate(records, start=1):
        if seen % 1000000 == 0:
            print('processed {} m'.format(seen / 1000000))

        (uid, _sid, _sstime, _nq, qid, q, qtime,
         _nc, _rank, url, _ctime, dtime) = record

        # Ids are handed out densely, in order of first appearance.
        query_id = query_dict.setdefault(q, len(query_dict))
        doc_id = doc_dict.setdefault(url, len(doc_dict))

        cell = qd_ctr[query_id][doc_id]
        cell[1] += 1
        # NOTE(review): if the loader yields dtime as a string this test is
        # always true -- confirm the field type against load_query_log.
        cell[0] += int(dtime != 0)

        # A query is counted once per run of consecutive records sharing
        # the same uid + qid + qtime key.
        current_key = uid + qid + qtime
        if current_key != previous_key:
            query_count[q] += 1
            previous_key = current_key

    # Collapse every [hits, total] cell into its ratio, in place.
    for per_doc in qd_ctr.values():
        for doc_id, (hits, total) in per_doc.items():
            per_doc[doc_id] = hits / total

    id_to_query = [(query_id, text) for text, query_id in query_dict.items()]
    by_frequency = sorted(query_count.items(), key=lambda item: -item[1])
    id_to_url = [(doc_id, link) for link, doc_id in doc_dict.items()]

    save_query_file(id_to_query, os.path.join(out_dir, 'query_all'))
    save_query_file(by_frequency, os.path.join(out_dir, 'query_freq'))
    save_judge_file(qd_ctr, os.path.join(out_dir, 'judgement_DCTR'))
    save_query_file(id_to_url, os.path.join(out_dir, 'docid_to_url'))
def click_to_rel():
    """Convert click-based scores into discrete relevance labels.

    Loads ``judgement_DCTR`` from ``args.data_dir`` (values read as float)
    and the reference judgements from ``args.judgement_refer`` (values read
    as int).  The cumulative share of each reference label is used to derive
    matching cut-offs over the sorted click scores; both tables are printed
    for inspection only.  The labels actually written to ``judgement_rel``
    come from the hard-coded threshold table below.

    Raises:
        Exception: if either judgement file yields no values.
    """
    out_dir = args.data_dir
    click_path = os.path.join(out_dir, 'judgement_DCTR')
    refer_path = args.judgement_refer
    judge_click = load_judge_file(click_path, scale=float)
    judge_refer = load_judge_file(refer_path, scale=int)

    rels = sorted(
        [judge_refer[q][d] for q in judge_refer for d in judge_refer[q]])
    clicks = sorted(
        [judge_click[q][d] for q in judge_click for d in judge_click[q]])
    if not rels or not clicks:
        raise Exception('judgement has no record')

    # ratio: one [label, cumulative share of values <= label] per distinct
    # reference label, in ascending label order.
    ratio = []
    n_rels = len(rels)
    previous = '#'
    for pos, rel in enumerate(rels):
        if rel == previous:
            continue
        if ratio:
            # A new label starts here, so close the previous label's share.
            ratio[-1][1] = pos / n_rels
        ratio.append([rel, 0])
        previous = rel
    ratio[-1][1] = 1

    # threshold: one [label, largest click score mapped to that label],
    # found by walking the sorted scores against the cumulative shares.
    threshold = []
    n_clicks = len(clicks)
    level = 0
    pending = None
    for pos, score in enumerate(clicks):
        while pos / n_clicks >= ratio[level][1]:
            level += 1
        if pending is not None and pending[0] != ratio[level][0]:
            threshold.append(pending)
        pending = [ratio[level][0], score]
    threshold.append(pending)

    print('ratio: {}'.format(ratio))
    print('threshold: {}'.format(threshold))

    # The derived table above is informational only; these hand-picked
    # cut-offs (the original author's guess) are what gets applied.
    threshold = [[0, 0.05], [1, 0.3], [2, 1]]

    def click2rel(click):
        # Label of the first bucket whose upper bound is not exceeded.
        bucket = 0
        while click > threshold[bucket][1]:
            bucket += 1
        return threshold[bucket][0]

    judge_rel = defaultdict(lambda: defaultdict(lambda: None))
    for q in judge_click:
        for d in judge_click[q]:
            judge_rel[q][d] = click2rel(judge_click[q][d])
    save_judge_file(judge_rel, os.path.join(out_dir, 'judgement_rel'))
def filter_judgement():
    """Drop judgements whose document URL ends in a non-HTML file extension.

    Loads ``docid_to_url`` and ``judgement_rel`` from ``args.data_dir``,
    skips every (query, doc) judgement whose URL ends with one of the
    filtered extensions, prints how many judgements were skipped, and
    writes the remainder to ``judgement`` in the same directory.

    Matching is case-insensitive: the URL is lower-cased before the suffix
    test, so mixed-case suffixes such as ``.Pdf`` are caught as well as
    ``.pdf`` and ``.PDF``.
    """
    # Lower-case only; the URL is lower-cased before comparison.
    filtered_ext = ('.pdf', '.ppt', '.pptx', '.doc', '.docx', '.txt')

    data_dir = args.data_dir
    docid_to_url = load_from_query_file(os.path.join(data_dir, 'docid_to_url'))
    qd_judge = load_judge_file(os.path.join(data_dir, 'judgement_rel'))

    qd_judge_new = defaultdict(lambda: defaultdict(lambda: None))
    count = 0  # number of (query, doc) judgements dropped
    for q in qd_judge:
        for d in qd_judge[q]:
            if docid_to_url[d].lower().endswith(filtered_ext):
                count += 1
                continue
            qd_judge_new[q][d] = qd_judge[q][d]

    print('#non-html url: {}'.format(count))
    save_judge_file(qd_judge_new, os.path.join(data_dir, 'judgement'))