def exe_extract_feature(argv):
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)

    global window_db, doc_db, word_stat, model_factory
    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)
    writer = open(out_path, 'w')

    global topic_chain, window_chain, doc_chain
    topic_chain = TextChain([TextTokenizer(word_tokenize),
                             TextStopRemover(stop_path),
                             TextStemmer(stemmer),
                             TextModeler(model_factory)])
    window_chain = topic_chain
    doc_chain = TextChain([TextTokenizer(word_tokenize),
                           TextStopRemover(stop_path),
                           TextStemmer(stemmer),
                           TextModeler(model_factory),
                           WindowWorker(window_chain),
                           DocumentTitleWorker(topic_chain)])

    global topic_id
    topic_ids = judge_file.keys()
    for topic_id in topic_ids:
        if not topics.has_key(topic_id):
            continue
        topic_str = topics[topic_id]
        print topic_id
        global topic
        topic = TextPiece(topic_str)
        topic_chain.work(topic)

        p = Pool(task_num)
        lines_group = p.map(multithread_extract_feature, judge_file[topic_id].items())
        for lines in lines_group:
            for line in lines:
                writer.write(line)
                writer.write('\n')
    writer.close()
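# The Pool.map call above relies on a module-level worker named
# multithread_extract_feature that is not defined in this file. A hedged sketch
# of the interface it is assumed to have: it receives one (docno, judge) pair
# from judge_file[topic_id].items() and returns a list of feature-line strings
# that exe_extract_feature then writes to the output file.
#
#   def multithread_extract_feature(item):
#       docno, judge = item
#       doc = TextPiece(doc_db[docno])
#       doc_chain.work(doc)           # runs the window and title workers
#       ...                           # score windows against the global topic
#       return lines                  # one feature line per window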
def exe_extract_windows(argv):
    topic_path, judge_path, text_db_path, windows_db_path = argv
    text_db = bsddb.hashopen(text_db_path)
    window_db = bsddb.hashopen(windows_db_path, 'w')
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    topic_chain = TextChain([TextTokenizer(word_tokenize),
                             TextStopRemover('data/stoplist.dft'),
                             TextStemmer(EnglishStemmer()),
                             TextTokenNormalizer()])
    sentence_chain = TextChain([TextTokenizer(word_tokenize),
                                TextStemmer(EnglishStemmer()),
                                TextTokenNormalizer()])
    for topic_id, topic_str in topics.items():
        print topic_id
        sys.stdout.flush()
        topic = TextPiece(topic_str)
        topic_chain.work(topic)
        if not judge_file.has_key(topic_id):
            continue
        docnos = judge_file[topic_id].keys()
        for docno in docnos:
            if not is_cluewebB(docno):
                continue
            doc_text = text_db[docno]
            window_candidates = match_window(topic, doc_text, sentence_chain)
            sentences = map(lambda text_piece: text_piece.text, window_candidates)
            text = '\n'.join(sentences)
            window_db[docno] = text.encode('utf8')
    window_db.close()
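# Layout of the window DB written above (restating what the code does):
# window_db maps each ClueWeb-B docno to its matched window sentences, joined
# by '\n' and UTF-8 encoded. A minimal read-back sketch with a hypothetical
# path:
#
#   db = bsddb.hashopen('windows.db')
#   sentences = db[docno].decode('utf8').split('\n')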
def exe_build_train(argv):
    #1. create the workers
    judge_path, topic_path, word_stat_path, doc_path, window_path, out_path = argv
    global judge_file, topics, doc_db, window_db, word_stat, ranker
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    doc_db = bsddb.hashopen(doc_path)
    window_db = bsddb.hashopen(window_path)
    word_stat = load_word_stat(word_stat_path)
    # aggregators = map(lambda k: Aggregator(k), K_options)
    # ranker = DistanceWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat), aggregators)
    ranker = RetrievalWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat))

    #2. build the training data
    # p = Pool(4)
    topic_ids = judge_file.keys()
    # docs_groups = p.map(build_train, topic_ids)
    docs_groups = map(build_train, topic_ids)
    assert len(docs_groups) == len(topic_ids)

    #3. write out the training data
    writer = open(out_path, 'w')
    for i in xrange(len(topic_ids)):
        topic_id = topic_ids[i]
        docs = docs_groups[i]
        for doc in docs:
            docno = doc.docno
            judge = judge_file[topic_id][docno]
            for scores, sentence_id in doc.score_windows:
                score_str = ','.join(map(str, scores))
                writer.write('%s %s %s %d %s\n' % (topic_id, docno, judge, sentence_id, score_str))
    writer.close()
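# Each line of the training file written above has the form
#   <topic_id> <docno> <judge> <sentence_id> <score_1,score_2,...>
# A minimal reader for that format (illustrative helper, not part of the
# original module):
def read_train_file(path):
    rows = []
    for line in open(path):
        topic_id, docno, judge, sentence_id, score_str = line.split()
        scores = map(float, score_str.split(','))
        rows.append((topic_id, docno, judge, int(sentence_id), scores))
    return rows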
def test_extract_text(judge_path, index_path):
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    print 'doc number:', len(docnos)
    for docno in filter(is_cluewebB, docnos)[:3]:
        text = extract_text(docno, index_path)
        print text
        print '-' * 20
def test_extract_text(judge_path, index_path, collection_type):
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    print 'doc number:', len(docnos)
    for docno in docnos[:1]:
        text = extract_text(docno, index_path, collection_type)
        print text
        print '-' * 20
def exe_extract_text(judge_path, index_path, out_path, collection_type='html'):
    '''extract texts of docs in qrel from an index, and store them in out_path in standard TREC format'''
    import Corpus
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    print 'doc number:', len(docnos)
    writer = Corpus.TRECWriter(out_path)
    for docno in docnos:
        text = extract_text(docno, index_path, collection_type)
        writer.write(Corpus.Document(docno, text))
def exe_extract_text(judge_path, index_path, text_db_path):
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    docnos = filter(is_cluewebB, docnos)
    # docnos = docnos[:1000]
    print 'doc number:', len(docnos)
    db = bsddb.hashopen(text_db_path, 'w')
    texts = fastmap.fastmap(lambda docno: extract_text(docno, index_path), 30, docnos)
    assert len(docnos) == len(texts)
    for i in xrange(len(docnos)):
        db[docnos[i]] = texts[i]
    db.close()
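# is_cluewebB is used in several functions above but not defined in this file.
# A minimal sketch under the assumption that it keeps only ClueWeb09 Category B
# docnos (English segments en0000-en0011 plus the enwp Wikipedia segments);
# the real definition may differ.
def _is_cluewebB_sketch(docno):
    # docnos look like 'clueweb09-en0003-55-12345' or 'clueweb09-enwp01-...'
    parts = docno.split('-')
    if len(parts) < 2:
        return False
    segment = parts[1]
    if segment.startswith('enwp'):
        return True
    return segment.startswith('en') and segment[2:].isdigit() and int(segment[2:]) <= 11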
def exe_stat_window(qrel_path, window_db_path):
    window_db = bsddb.hashopen(window_db_path)
    qrel = QRelFile(qrel_path)
    sentence_nums = []
    sentence_lens = []
    for q in qrel.keys():
        for d in qrel.get(q).keys():
            if window_db.has_key(d):
                window = window_db[d]
                sentences = window.split('\n')
                sentence_nums.append(len(sentences))
                sentence_lens += map(lambda sentence: len(sentence.split()), sentences)
    print np.mean(sentence_nums), np.median(sentence_nums), np.mean(sentence_lens), np.median(sentence_lens)
def exe_example(snippet_judge_path, doc_judge_path):
    # def exe_example(snippet_judge_path, doc_judge_path, bing_path, sum_path, dsm_path):
    from JudgeFile import QRelFile
    snippet_judge = load_snippet_judge(snippet_judge_path)
    doc_judge = QRelFile(doc_judge_path)
    sources = snippet_judge.keys()
    for topic_id in snippet_judge[sources[0]].keys():
        for docno in snippet_judge[sources[0]][topic_id]:
            in_rel = int(doc_judge.get_value(topic_id, docno))
            if in_rel <= 0:
                in_rel = 0
            elif in_rel > 0:
                in_rel = 1
            if snippet_judge['bing'].has_key(topic_id) and snippet_judge['pablo.short'].has_key(topic_id) and snippet_judge['windowshop.oq'].has_key(topic_id):
                bing_per_rel = int(snippet_judge['bing'][topic_id][docno])
                sum_per_rel = int(snippet_judge['pablo.short'][topic_id][docno])
                oq_per_rel = int(snippet_judge['windowshop.oq'][topic_id][docno])
                if (in_rel != bing_per_rel or in_rel != sum_per_rel) and in_rel == oq_per_rel == 1:
                    print topic_id, docno, in_rel, bing_per_rel, sum_per_rel, oq_per_rel
import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *

topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])

lemmas = set()
topic_ids = judge_file.keys()
for topic_id in topic_ids:
    if not topics.has_key(topic_id):
        continue
    topic_str = topics[topic_id]
    topic = TextPiece(topic_str)
    lemmas.update(topic.lemmas)

for lemma in lemmas:
    print lemma
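# Usage sketch for the standalone script above (argument order taken from the
# sys.argv reads; the script name is hypothetical):
#   python extract_topic_lemmas.py <topic_path> <qrel_path> > lemmas.txt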