Exemplo n.º 1
0
def test_extract_text(judge_path, index_path):
    """Print the extracted text of up to three judged documents.

    Loads the qrel file at ``judge_path``, prints how many keys ``key2s()``
    returns (used here as document numbers), then prints the result of
    ``extract_text`` for the first three of those keys accepted by
    ``is_cluewebB``. Each text is followed by a line of 20 dashes.

    Args:
        judge_path: path handed to ``QRelFile``.
        index_path: index location forwarded to ``extract_text``.
    """
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('doc number: %d' % len(docnos))
    # Build a real list before slicing: filter() is lazy on Python 3 and
    # cannot be sliced there.
    selected = [docno for docno in docnos if is_cluewebB(docno)][:3]
    for docno in selected:
        print(extract_text(docno, index_path))
        print('-' * 20)
Exemplo n.º 2
0
def test_extract_text(judge_path, index_path, collection_type):
    """Print the extracted text of the first judged document.

    Loads the qrel file at ``judge_path``, prints how many keys ``key2s()``
    returns (used here as document numbers), then prints the result of
    ``extract_text`` for the first key only, followed by a line of 20 dashes.
    Nothing beyond the count is printed when there are no keys.

    Args:
        judge_path: path handed to ``QRelFile``.
        index_path: index location forwarded to ``extract_text``.
        collection_type: forwarded unchanged to ``extract_text``.
    """
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('doc number: %d' % len(docnos))
    for docno in docnos[:1]:
        print(extract_text(docno, index_path, collection_type))
        print('-' * 20)
Exemplo n.º 3
0
def exe_extract_text(judge_path, index_path, out_path, collection_type='html'):
    """Extract the texts of the docs in a qrel from an index.

    The texts are stored in ``out_path`` in standard TREC format: every key
    returned by the qrel's ``key2s()`` is passed to ``extract_text`` and the
    result is written as a ``Corpus.Document`` through a
    ``Corpus.TRECWriter``.

    Args:
        judge_path: path handed to ``QRelFile``.
        index_path: index location forwarded to ``extract_text``.
        out_path: destination handed to ``Corpus.TRECWriter``.
        collection_type: forwarded unchanged to ``extract_text``
            (defaults to ``'html'``).
    """
    import Corpus

    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('doc number: %d' % len(docnos))
    writer = Corpus.TRECWriter(out_path)
    # NOTE(review): the writer is never explicitly closed or flushed here;
    # confirm TRECWriter persists its output on its own.
    for docno in docnos:
        text = extract_text(docno, index_path, collection_type)
        writer.write(Corpus.Document(docno, text))
Exemplo n.º 4
0
def exe_extract_text(judge_path, index_path, text_db_path):
    """Extract texts for the judged documents and store them in a bsddb hash.

    Takes the keys returned by the qrel's ``key2s()`` (used here as document
    numbers), keeps those accepted by ``is_cluewebB``, runs ``extract_text``
    on each through ``fastmap.fastmap`` and stores every result in the
    database under its document number.

    Args:
        judge_path: path handed to ``QRelFile``.
        index_path: index location forwarded to ``extract_text``.
        text_db_path: path of the bsddb hash database to write into.

    Raises:
        AssertionError: if ``fastmap`` returns a different number of results
            than there are document numbers.
    """
    judge_file = QRelFile(judge_path)
    # A list (not a lazy filter object) so len() and re-iteration both work.
    docnos = [docno for docno in judge_file.key2s() if is_cluewebB(docno)]
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('doc number: %d' % len(docnos))

    # NOTE(review): per the bsddb docs, flag 'w' opens an existing database
    # read-write and does not create one; confirm that 'c' is not intended.
    db = bsddb.hashopen(text_db_path, 'w')
    try:
        # The database is opened first so a bad path fails before the
        # extraction work starts.
        # presumably 30 is the degree of parallelism; verify against fastmap.
        texts = fastmap.fastmap(
            lambda docno: extract_text(docno, index_path), 30, docnos)
        # Guard the pairing below: zip() would silently truncate on mismatch.
        assert len(docnos) == len(texts)
        for docno, text in zip(docnos, texts):
            db[docno] = text
    finally:
        # Always release the database handle, even if extraction fails.
        db.close()