示例#1
0
文件: prep.py 项目: jzbjyb/rri
def prep_query_log():
    """Aggregate a query log into id maps, query frequencies and DCTR scores.

    Reads the log at ``args.query_log`` (loaded with ``format='bing'``) and
    writes four files under ``args.data_dir``:

    * ``query_all``      -- (query id, query text) pairs.
    * ``query_freq``     -- (query text, count) pairs, most frequent first.
    * ``judgement_DCTR`` -- for every (query id, doc id) pair, the fraction
      of its log records whose ``dtime`` is non-zero.
    * ``docid_to_url``   -- (doc id, url) pairs.

    Ids are assigned in order of first appearance, starting at 0.
    """
    query_log_filepath = args.query_log
    data_dir = args.data_dir
    query_dict = {}  # query text -> query id
    doc_dict = {}  # url -> doc id
    query_count = defaultdict(int)
    # qd_ctr[query id][doc id] = [records with dtime != 0, all records]
    qd_ctr = defaultdict(lambda: defaultdict(lambda: [0, 0]))
    # Identity of the query issue seen on the previous record. A tuple is
    # used instead of concatenating the fields so that different
    # (uid, qid, qtime) triples can never collapse into the same key.
    last_query = None
    records = load_query_log(query_log_filepath,
                             format='bing',
                             iterable=True)
    for ind, fetch in enumerate(records, start=1):
        if ind % 1000000 == 0:
            # Floor division keeps the progress counter an integer.
            print('processed {} m'.format(ind // 1000000))
        (uid, _sid, _sstime, _nq, qid, q, qtime,
         _nc, _rank, url, _ctime, dtime) = fetch
        query_id = query_dict.setdefault(q, len(query_dict))
        doc_id = doc_dict.setdefault(url, len(doc_dict))
        counts = qd_ctr[query_id][doc_id]
        counts[1] += 1
        # NOTE(review): if load_query_log yields dtime as a string this
        # comparison is always true -- confirm that dtime is numeric.
        counts[0] += int(dtime != 0)
        query_key = (uid, qid, qtime)
        if query_key != last_query:
            # Consecutive records sharing the same key belong to one issue
            # of the query, so the query is counted only once for them.
            query_count[q] += 1
            last_query = query_key
    # Collapse each [non-zero dtime, total] pair into a single ratio.
    for per_doc in qd_ctr.values():
        for doc_id in per_doc:
            hit, total = per_doc[doc_id]
            per_doc[doc_id] = hit / total
    save_query_file([(v, k) for k, v in query_dict.items()],
                    os.path.join(data_dir, 'query_all'))
    save_query_file(sorted(query_count.items(), key=lambda x: -x[1]),
                    os.path.join(data_dir, 'query_freq'))
    save_judge_file(qd_ctr, os.path.join(data_dir, 'judgement_DCTR'))
    save_query_file([(v, k) for k, v in doc_dict.items()],
                    os.path.join(data_dir, 'docid_to_url'))
示例#2
0
文件: prep.py 项目: jzbjyb/rri
def click_to_rel():
    """Convert click-based judgements into graded relevance judgements.

    Loads ``judgement_DCTR`` from ``args.data_dir`` (values parsed as float)
    and the reference judgements from ``args.judgement_refer`` (values parsed
    as int). It computes the cumulative share of each reference grade and
    the click values at which the sorted click scores cross those shares,
    and prints both for inspection. The labels actually written to
    ``judgement_rel`` come from a fixed, hand-picked threshold table, not
    from the computed one.

    Raises:
        Exception: if either judgement file contains no record.
        IndexError: if a click score exceeds the largest fixed threshold.
    """
    data_dir = args.data_dir
    click_path = os.path.join(data_dir, 'judgement_DCTR')
    refer_path = args.judgement_refer
    judge_click = load_judge_file(click_path, scale=float)
    judge_refer = load_judge_file(refer_path, scale=int)

    # Flatten both judgement tables into sorted lists of values.
    rels = sorted(judge_refer[q][d] for q in judge_refer for d in judge_refer[q])
    clicks = sorted(judge_click[q][d] for q in judge_click for d in judge_click[q])
    if not rels or not clicks:
        raise Exception('judgement has no record')

    # ratio: one [grade, cumulative share] entry per distinct grade, where
    # the share is the fraction of reference records with grade <= this one.
    total_rels = len(rels)
    ratio = []
    for pos, grade in enumerate(rels):
        if ratio and not ratio[-1][0] != grade:
            continue
        if ratio:
            # A new grade starts here, so the previous one ends at pos.
            ratio[-1][1] = pos / total_rels
        ratio.append([grade, 0])
    ratio[-1][1] = 1

    # threshold: for each grade bucket, the largest click score that falls
    # into it when clicks are split by the same cumulative shares.
    total_clicks = len(clicks)
    threshold = []
    bucket = 0
    previous = None
    for pos, score in enumerate(clicks):
        share = pos / total_clicks
        while share >= ratio[bucket][1]:
            bucket += 1
        grade = ratio[bucket][0]
        if previous is not None and previous[0] != grade:
            threshold.append(previous)
        previous = [grade, score]
    threshold.append(previous)

    print('ratio: {}'.format(ratio))
    print('threshold: {}'.format(threshold))
    # The computed table is only informational; the author's fixed guess
    # below is what drives the conversion.
    threshold = [[0, 0.05], [1, 0.3], [2, 1]]  # my guess
    judge_rel = defaultdict(lambda: defaultdict(lambda: None))

    def click2rel(click):
        # Return the grade of the first entry whose upper bound covers click.
        idx = 0
        while click > threshold[idx][1]:
            idx += 1
        return threshold[idx][0]

    for q in judge_click:
        per_doc = judge_click[q]
        for d in per_doc:
            judge_rel[q][d] = click2rel(per_doc[d])
    save_judge_file(judge_rel, os.path.join(data_dir, 'judgement_rel'))
示例#3
0
文件: prep.py 项目: jzbjyb/rri
def filter_judgement():
    """Drop judgements whose document URL ends with a non-HTML extension.

    Loads ``docid_to_url`` and ``judgement_rel`` from ``args.data_dir``,
    skips every (query, doc) pair whose URL ends with one of the filtered
    file extensions, prints how many pairs were skipped, and writes the
    remaining judgements to ``judgement`` in the same directory.

    The extension match is case-insensitive, so mixed-case suffixes such as
    ``.Pdf`` are filtered as well as ``.pdf`` and ``.PDF``.
    """
    filtered_ext = ('.pdf', '.ppt', '.pptx', '.doc', '.docx', '.txt')
    data_dir = args.data_dir
    docid_to_url = load_from_query_file(os.path.join(data_dir, 'docid_to_url'))
    qd_judge = load_judge_file(os.path.join(data_dir, 'judgement_rel'))
    qd_judge_new = defaultdict(lambda: defaultdict(lambda: None))
    count = 0
    for q in qd_judge:
        for d in qd_judge[q]:
            # Lower-case the URL once so a single tuple of lower-case
            # extensions covers every capitalisation.
            if docid_to_url[d].lower().endswith(filtered_ext):
                count += 1
                continue
            qd_judge_new[q][d] = qd_judge[q][d]
    print('#non-html url: {}'.format(count))
    save_judge_file(qd_judge_new, os.path.join(data_dir, 'judgement'))