def one_click_search(ini, query_str, outputs):
    if bool(ini.get('condition_no_cclparser', '')) or \
       bool(ini.get('condition_baseline', '')):
        parser.USE_CCLPARSER = False
    if bool(ini.get('condition_no_boilerplate', '')) or \
       bool(ini.get('condition_baseline', '')):
        html_to_trec.USE_BOILERPLATE = False
    if bool(ini.get('condition_patterns', '')):
        nugget_finder.USE_PATTERNS = True
    if bool(ini.get('condition_candidate_scorer', '')):
        nugget_finder.USE_CANDIDATE_SCORER = True
    if bool(ini.get('condition_no_ilp', '')):
        USE_ILP = False
    else:
        USE_ILP = True

    ####
    # fetch results from Web search engine (or cache)
    #
    (htmls, html_urls) = query_web_search(query_str, ini)
    print "found", len(htmls), "pages"

    ####
    # extract relevant nuggets
    #
    (scored_candidates, parsed_query, path_to_index) = find_nuggets(ini, htmls, query_str)

    ####
    # final output
    #
    final_passages = do_search(parsed_query,
                               ini.get('search_command', './cpp/Search'),
                               path_to_index,
                               int(ini.get('main_search_passage_count', 3)))

    results = {}
    if USE_ILP:
        ####
        # assemble final output
        #
        for (final_length, output_type) in outputs:
            results[output_type] = assemble_output_ilp(final_passages, scored_candidates, final_length)
    else:
        ####
        # score final passages
        #
        final_passages_scored = score_passages(final_passages, scored_candidates)

        ####
        # assemble final output
        #
        for (final_length, output_type) in outputs:
            results[output_type] = assemble_output(final_passages_scored, final_length)

    return (results, html_urls)
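# Hedged usage sketch (not part of the original module): a minimal driver for
# one_click_search. The config filename, the query string, and the
# (length, label) pairs are illustrative assumptions; only the
# (final_length, output_type) tuple shape of `outputs` and the existence of
# nugget_finder.load_ini are taken from the surrounding code.
def _example_one_click_search():
    from nugget_finder import load_ini
    ini = load_ini('one_click.ini')                # hypothetical config file; load_ini signature assumed
    outputs = [(140, 'mobile'), (500, 'desktop')]  # (final_length, output_type) pairs
    results, html_urls = one_click_search(ini, 'who invented the telephone', outputs)
    for output_type in results:
        print output_type, '->', results[output_type]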
def __call__(self, query):
    from nugget_finder import load_ini, do_search, identify_candidates
    return do_search(query, self.search_command, self.index_path, self.ret_size)
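# Hedged sketch (not from the original source): __call__ above reads
# self.search_command, self.index_path and self.ret_size, and gen_nugget_train
# below constructs Searcher(search_command, path_to_index, passage_count), so
# the enclosing class plausibly looks like the commented outline here. The
# __init__ body is an assumption inferred from those call sites.
#
# class Searcher(object):
#     def __init__(self, search_command, index_path, ret_size):
#         self.search_command = search_command  # path to the Search binary
#         self.index_path = index_path          # Indri index to query
#         self.ret_size = ret_size              # number of passages to return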
def gen_nugget_train(ini, htmls, query_str, good_text):
    from nugget_finder import load_ini, do_search, identify_candidates
    tmp_folder = ini.get('tmp_folder', './tmp')
    good_text = good_text.lower()

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder, )
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)
    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html, )
        if os.path.exists(cached_detag):
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html, outfile=outfile, id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder, )
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)
    config_template = file(
        ini.get('index_config_template', "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder, )
    config_file = open(config_filename, "w")
    config_file.write(
        config_template.format(path_to_corpus=path_to_corpus,
                               path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get('index_command', 'IndriBuildIndex')
    retcode = subprocess.call([index_command, config_filename],
                              stdout=sys.stderr,
                              stderr=sys.stderr)
    assert retcode == 0

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)
    if bool(ini.get('condition_baseline', '')):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get('search_command', './cpp/Search')
    main_passages = do_search(parsed_query, search_command, path_to_index, 2000)
    #print 'passage num:', len(main_passages)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get('top_documents_for_candidate', '20'))
    candidates, main_evidence = identify_candidates(
        main_passages, int(ini.get('main_search_passage_count', 3)),
        top_documents)
    print 'candidate num:', len(candidates)

    ###
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    t0 = time.time()
    searcher = Searcher(search_command, path_to_index,
                        int(ini.get('evidence_search_passage_count', 10)))
    p = Pool(8)
    queries = map(
        lambda candidate: list(parsed_query) + [('NE', candidate[1])],
        candidates)
    evidence_passages_list = p.map(searcher, queries, 50)
    p.close()
    print 'pool map evidence search:', time.time() - t0
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        evidence[candidate[0]] = filter(
            lambda passage: all(
                map(lambda token: token.lower() in passage[1].lower(),
                    candidate[1])), evidence_passages_list[i])

    ####
    # evaluate evidence
    #
    sys.stderr.write("Generating Training...\n")
    instances = []
    total = len(evidence)
    t0 = time.time()
    gen = TrainGenerator(ini.get('dumpindex_command', 'dumpindex'),
                         ini.get('stat_index'))
    inputs = map(
        lambda candidate: (candidate, evidence[candidate],
                           main_evidence[candidate], good_text),
        evidence.keys())
    p = Pool(8)
    instances = p.map(gen, inputs, 50)
    p.close()
    print 'pool map evaluating:', time.time() - t0

    ####
    # clean up
    #
    for i in xrange(0, html_count):
        try:
            os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
        except:
            pass

    return instances
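# Hedged usage sketch (not part of the original module): illustrates the inputs
# gen_nugget_train expects. The config and data filenames and the query are
# illustrative assumptions; what is taken from the code above is that `htmls`
# must be paths to HTML files already on disk (a "<path>.txt" detag cache is
# written next to each one) and that `good_text` is the reference answer text
# that TrainGenerator scores evidence against.
def _example_gen_nugget_train():
    from nugget_finder import load_ini
    ini = load_ini('one_click.ini')                       # hypothetical config file
    htmls = ['./cache/page0.html', './cache/page1.html']  # cached HTML pages on disk
    good_text = open('./data/reference_answer.txt').read()
    instances = gen_nugget_train(ini, htmls, 'who invented the telephone', good_text)
    print 'generated', len(instances), 'training instances'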