Example #1
File: prep.py Project: jzbjyb/rri
def filter_query():
    # Keep only the queries whose frequency (from the 'query_freq' file) reaches min_query_freq.
    data_dir = args.data_dir
    min_query_freq = args.min_query_freq
    query2freq = load_from_query_file(os.path.join(data_dir, 'query_freq'))
    qid2query = load_from_query_file(os.path.join(data_dir, 'query_all'))
    save_query_file([(k, v) for k, v in qid2query.items()
                     if int(query2freq[v]) >= min_query_freq],
                    os.path.join(data_dir, 'query'))
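All of these snippets lean on small helpers defined elsewhere in prep.py. Based on how they are used here, the query files appear to be tab-separated key/value lines; the following is a minimal sketch of load_from_query_file and save_query_file under that assumption, not the project's actual implementation:

def load_from_query_file(path):
    # Assumption: one "key<TAB>value" line per entry (e.g. qid<TAB>query text or docid<TAB>url).
    result = {}
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line:
                continue
            key, value = line.split('\t', 1)
            result[key] = value
    return result

def save_query_file(pairs, path):
    # Write (key, value) pairs back out in the same tab-separated format.
    with open(path, 'w', encoding='utf-8') as fp:
        for key, value in pairs:
            fp.write('{}\t{}\n'.format(key, value))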
Example #2
File: prep.py Project: jzbjyb/rri
def shuqi_bing_redirect():
    # Copy each crawled Bing page into data_dir/docs/<docid>.html, dropping
    # everything up to and including the MARK separator.
    MARK = b'\t-----\t'
    data_dir = args.data_dir
    shuqi_bing_web_dir = args.shuqi_bing_web_dir
    docid_to_url = load_from_query_file(os.path.join(data_dir, 'docid_to_url'))
    print('#all url: {}'.format(len(docid_to_url)))
    url_to_docid = {v: k for k, v in docid_to_url.items()}
    count = 0
    wrong_url_count = 0
    with open(os.path.join(shuqi_bing_web_dir, 'allweb.txt'), 'r') as fp:
        for l in fp:
            l = l.strip()
            url, ind = l.split('\t')
            if url not in url_to_docid:
                wrong_url_count += 1
                continue
            old_path = os.path.join(shuqi_bing_web_dir,
                                    'web{}.txt'.format(ind))
            if not os.path.exists(old_path):
                continue
            count += 1
            if count % 100000 == 0:
                print('count: {}w'.format(count // 10000))
            new_ind = url_to_docid[url]
            with open(os.path.join(data_dir, 'docs', new_ind + '.html'),
                      'wb') as nh:
                try:
                    with open(old_path, 'rb') as oh:
                        h = oh.read()
                except:
                    print('read error: {}'.format(old_path))
                    raise
                nh.write(h[h.find(MARK) + len(MARK):])
    print('#downloaded url: {}, #wrong url: {}'.format(count, wrong_url_count))
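The copy above keeps only the bytes that follow the MARK separator. A small self-contained illustration of that slicing (the URL and HTML below are made up), including the edge case where the separator is missing:

MARK = b'\t-----\t'

# Normal case: everything after the separator is the raw HTML.
raw = b'http://example.com\t-----\t<html><body>hello</body></html>'
print(raw[raw.find(MARK) + len(MARK):])  # b'<html><body>hello</body></html>'

# Edge case: if MARK is absent, find() returns -1, so the slice starts at
# len(MARK) - 1 = 6 and silently keeps most of the content instead of failing.
no_mark = b'<html>no separator</html>'
print(no_mark[no_mark.find(MARK) + len(MARK):])  # b'no separator</html>'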
Example #3
File: prep.py Project: jzbjyb/rri
def generate_train_test():
    # Build pointwise (qid, docid, label) samples and write train/test files.
    data_dir = args.data_dir
    query_filepath = os.path.join(data_dir, 'query')
    judge_filepath = os.path.join(data_dir, 'judgement')
    run_filepath = os.path.join(data_dir, 'run')
    # split into train and test by unique query string rather than by qid,
    # so qids that share the same query text land on the same side
    query_dict = load_from_query_file(query_filepath)
    unique_queries = np.unique(list(query_dict.values()))
    np.random.shuffle(unique_queries)
    train_size = int(len(unique_queries) * args.train_test_ratio)
    test_size = len(unique_queries) - train_size
    if train_size <= 0 or test_size <= 0:
        raise ValueError('train/test split is empty; check train_test_ratio')
    print('#unique queries: {}, train size: {}, test size: {}'.format(
        len(unique_queries), train_size, test_size))
    train_queries = set(unique_queries[:train_size])
    test_queries = set(unique_queries[train_size:])
    train_qids = {q for q in query_dict if query_dict[q] in train_queries}
    test_qids = {q for q in query_dict if query_dict[q] in test_queries}
    miss_docs = set()
    have_docs = set()
    train_samples = []
    test_samples = []
    qd_judge = load_judge_file(judge_filepath)
    for q in qd_judge:
        for d in qd_judge[q]:
            if qd_judge[q][d] is None:  # skip documents without judgement
                continue
            if not os.path.exists(os.path.join(data_dir, 'docs', d + '.html')):
                miss_docs.add(d)
                continue
            have_docs.add(d)
            if q in train_qids:
                train_samples.append((q, d, qd_judge[q][d]))
            elif q in test_qids and not os.path.exists(run_filepath):
                test_samples.append((q, d, qd_judge[q][d]))
    if os.path.exists(run_filepath):
        run_result = load_run_file(run_filepath)
        for q, _, d, rank, score, _ in run_result:
            if qd_judge[q][d] is None:  # skip documents without judgement
                continue
            if not os.path.exists(os.path.join(data_dir, 'docs', d + '.html')):
                miss_docs.add(d)
                continue
            have_docs.add(d)
            if q in test_qids:
                test_samples.append((q, d, qd_judge[q][d]))
    print('have {} docs, miss {} docs'.format(len(have_docs), len(miss_docs)))
    save_train_test_file(train_samples,
                         os.path.join(data_dir, 'train.pointwise'))
    save_train_test_file(test_samples, os.path.join(data_dir,
                                                    'test.pointwise'))
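The run branch unpacks six fields per result, which suggests a TREC-style run file (qid, Q0, docid, rank, score, run name). A hedged sketch of load_run_file under that assumption; the project's actual parser and field separators may differ:

def load_run_file(path):
    # Assumption: whitespace-separated "qid Q0 docid rank score run_name" lines,
    # matching the 6-tuple unpacked in generate_train_test above.
    results = []
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            qid, q0, docid, rank, score, name = line.split()
            results.append((qid, q0, docid, int(rank), float(score), name))
    return results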
Example #4
File: prep.py Project: jzbjyb/rri
def filter_judgement():
    # Drop judgements whose document URL points to a non-HTML file type.
    filtered_ext = ['.pdf', '.ppt', '.pptx', '.doc', '.docx', '.txt']
    filtered_ext = tuple(filtered_ext + [ext.upper() for ext in filtered_ext])
    allowed_ext = tuple(['html', 'htm', 'com', 'cn', 'asp', 'shtml', 'php'])  # not used below
    data_dir = args.data_dir
    docid_to_url = load_from_query_file(os.path.join(data_dir, 'docid_to_url'))
    qd_judge = load_judge_file(os.path.join(data_dir, 'judgement_rel'))
    qd_judge_new = defaultdict(lambda: defaultdict(lambda: None))
    count = 0
    for q in qd_judge:
        for d in qd_judge[q]:
            if docid_to_url[d].endswith(filtered_ext):
                count += 1
                continue
            qd_judge_new[q][d] = qd_judge[q][d]
    print('#non-html url: {}'.format(count))
    save_judge_file(qd_judge_new, os.path.join(data_dir, 'judgement'))
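The filter works because str.endswith accepts a tuple of suffixes and returns True if any of them matches, which is also why filtered_ext is converted to a tuple with both lower- and upper-case variants. A quick illustration (the URLs are made up):

filtered_ext = ('.pdf', '.ppt', '.PDF', '.PPT')
print('http://example.com/report.PDF'.endswith(filtered_ext))  # True
print('http://example.com/index.html'.endswith(filtered_ext))  # False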
Example #5
File: prep.py Project: jzbjyb/rri
def preprocess():
    # Build the vocabulary from training queries/docs, write id-encoded
    # query.prep and docs.prep files, and filter out samples with empty text.
    binary = args.binary_html
    data_dir = args.data_dir
    max_vocab_size = args.max_vocab_size
    docs_dir = os.path.join(data_dir, 'docs')
    query_filepath = os.path.join(data_dir, 'query')
    train_filepath = os.path.join(data_dir, 'train.pointwise')
    test_filepath = os.path.join(data_dir, 'test.pointwise')
    vocab = Vocab(max_size=max_vocab_size)
    train_query_ids, train_doc_ids = get_query_doc_ids(train_filepath)
    test_query_ids, test_doc_ids = get_query_doc_ids(test_filepath)
    query_ids = train_query_ids | test_query_ids
    doc_ids = train_doc_ids | test_doc_ids
    print('total query: {}, total doc: {}'.format(len(query_ids),
                                                  len(doc_ids)))
    query_dict = load_from_query_file(query_filepath)
    doc_dict = {}
    for qid in sorted(train_query_ids):
        for term in query_dict[qid].split():
            vocab.add(term)
    count = 0
    for docid in sorted(train_doc_ids):
        count += 1
        if count % 10000 == 0:
            print('processed {}w docs'.format(count // 10000))
        doc_body = load_from_html_cascade(os.path.join(docs_dir,
                                                       docid + '.html'),
                                          binary=binary)['body']
        doc_dict[docid] = doc_body
        #print(docid)
        #print(' '.join(doc_body))
        #input()
        for term in doc_body:
            vocab.add(term)
    vocab.build()
    vocab.save_to_file(os.path.join(data_dir, 'vocab'))
    empty_qid, empty_docid = set(), set()
    with open(os.path.join(data_dir, 'query.prep'), 'w') as fp:
        for qid in sorted(query_ids):
            qt = query_dict[qid].split()
            if len(qt) == 0:
                empty_qid.add(qid)
                continue
            fp.write('{}\t{}\n'.format(
                qid, ' '.join(map(str, vocab.encode(qt)))))
    with open(os.path.join(data_dir, 'docs.prep'), 'w') as fp:
        for docid in sorted(doc_ids):
            if docid in doc_dict:
                doc_body = doc_dict[docid]
            else:
                doc_body = load_from_html_cascade(os.path.join(
                    docs_dir, docid + '.html'),
                                                  binary=binary)['body']
            if len(doc_body) == 0:
                empty_docid.add(docid)
                continue
            fp.write('{}\t{}\n'.format(
                docid, ' '.join(map(str, vocab.encode(doc_body)))))
    print('have {} empty query, have {} empty doc'.format(
        len(empty_qid), len(empty_docid)))
    filter_samples(train_filepath,
                   '{}.prep.{}'.format(*train_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
    filter_samples(test_filepath,
                   '{}.prep.{}'.format(*test_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
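preprocess() depends on a Vocab class that is not part of these excerpts. The sketch below mimics the surface API it uses (add/build/encode/save_to_file); the unknown-token handling, id assignment, and file format are assumptions rather than the rri project's implementation:

from collections import Counter

class Vocab(object):
    def __init__(self, max_size=None, unk='<unk>'):
        self.max_size = max_size
        self.unk = unk
        self.counter = Counter()
        self.term_to_id = {}

    def add(self, term):
        # Count term occurrences while scanning the training data.
        self.counter[term] += 1

    def build(self):
        # Keep the max_size most frequent terms; id 0 is reserved for unknowns.
        self.term_to_id = {self.unk: 0}
        for term, _ in self.counter.most_common(self.max_size):
            self.term_to_id.setdefault(term, len(self.term_to_id))

    def encode(self, terms):
        # Map terms to integer ids, falling back to the unknown id.
        return [self.term_to_id.get(t, self.term_to_id[self.unk]) for t in terms]

    def save_to_file(self, path):
        with open(path, 'w', encoding='utf-8') as fp:
            for term, idx in sorted(self.term_to_id.items(), key=lambda kv: kv[1]):
                fp.write('{}\t{}\n'.format(term, idx))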