def do_gen_nugget_train(ini_path):
    """Generate nugget training data for every ground-truth query.

    Loads the configuration at ``ini_path``, runs a web search for each
    ``(query, good_text)`` record returned by ``read_groundtruth`` and writes
    one line per instance produced by ``gen_nugget_train`` to the file named
    by the ``train_path`` setting, formatted as::

        <is_good>,<feature_1>,...,<feature_n>#<query>:<candidate>

    Settings read from the ini: ``condition_no_cclparser``,
    ``condition_no_boilerplate``, ``condition_baseline``, ``ground_truth``,
    ``train_path`` and ``tmp_folder``.

    After each query the temporary folder is removed and recreated; errors
    during that cleanup are printed and otherwise ignored. The training file
    is closed even when a query fails part-way through.
    """
    from nugget_finder import load_ini
    from one_click_search import query_web_search

    ini = load_ini(ini_path)

    # NOTE(review): the from-imports above do not bind the module names, so
    # `nugget_finder`, `parser` and `html_to_trec` are presumably imported at
    # the top of the file -- verify.
    nugget_finder.USE_PATTERNS = True

    # Any truthy setting enables a condition; the baseline condition turns
    # off both the CCL parser and the boilerplate step.
    # NOTE(review): if load_ini yields raw strings, a value such as 'False'
    # is truthy as well -- confirm against load_ini.
    baseline = bool(ini.get('condition_baseline', ''))
    if bool(ini.get('condition_no_cclparser', '')) or baseline:
        parser.USE_CCLPARSER = False
    if bool(ini.get('condition_no_boilerplate', '')) or baseline:
        html_to_trec.USE_BOILERPLATE = False

    records = read_groundtruth(ini.get('ground_truth'))

    # NOTE(review): with the '/tmp' fallback the cleanup below removes and
    # recreates '/tmp' itself -- confirm tmp_folder is always configured.
    tmp_folder = ini.get('tmp_folder', '/tmp')

    # `with` guarantees the training file is closed if a search or the
    # feature generation raises.
    with open(ini.get('train_path'), 'w') as writer:
        for query_str, good_text in records:
            print('query: %s' % query_str)
            htmls, _urls = query_web_search(query_str, ini)
            print('found %d pages' % len(htmls))

            instances = gen_nugget_train(ini, htmls, query_str, good_text)
            for candidate, is_good, features in instances:
                writer.write('%d,%s#%s\n' % (
                    is_good,
                    ','.join(map(str, features)),
                    '%s:%s' % (query_str, candidate)))
            # Flush per query so finished queries reach disk before the next
            # (potentially slow) search starts.
            writer.flush()

            # Best-effort reset of the scratch folder between queries.
            try:
                rmtree(tmp_folder)
                os.mkdir(tmp_folder)
            except Exception as e:
                print(e)
def do_gen_nugget_train(ini_path):
    """Generate nugget training data for every ground-truth query.

    Loads the configuration at ``ini_path``, runs a web search for each
    ``(query, good_text)`` record returned by ``read_groundtruth`` and writes
    one line per instance produced by ``gen_nugget_train`` to the file named
    by the ``train_path`` setting, formatted as::

        <is_good>,<feature_1>,...,<feature_n>#<query>:<candidate>

    Settings read from the ini: ``condition_no_cclparser``,
    ``condition_no_boilerplate``, ``condition_baseline``, ``ground_truth``,
    ``train_path`` and ``tmp_folder``.

    After each query the temporary folder is removed and recreated; errors
    during that cleanup are printed and otherwise ignored. The training file
    is closed even when a query fails part-way through.
    """
    from nugget_finder import load_ini
    from one_click_search import query_web_search

    ini = load_ini(ini_path)

    # NOTE(review): the from-imports above do not bind the module names, so
    # `nugget_finder`, `parser` and `html_to_trec` are presumably imported at
    # the top of the file -- verify.
    nugget_finder.USE_PATTERNS = True

    # Any truthy setting enables a condition; the baseline condition turns
    # off both the CCL parser and the boilerplate step.
    # NOTE(review): if load_ini yields raw strings, a value such as 'False'
    # is truthy as well -- confirm against load_ini.
    baseline = bool(ini.get('condition_baseline', ''))
    if bool(ini.get('condition_no_cclparser', '')) or baseline:
        parser.USE_CCLPARSER = False
    if bool(ini.get('condition_no_boilerplate', '')) or baseline:
        html_to_trec.USE_BOILERPLATE = False

    records = read_groundtruth(ini.get('ground_truth'))

    # NOTE(review): with the '/tmp' fallback the cleanup below removes and
    # recreates '/tmp' itself -- confirm tmp_folder is always configured.
    tmp_folder = ini.get('tmp_folder', '/tmp')

    # `with` guarantees the training file is closed if a search or the
    # feature generation raises.
    with open(ini.get('train_path'), 'w') as writer:
        for query_str, good_text in records:
            print('query: %s' % query_str)
            htmls, _urls = query_web_search(query_str, ini)
            print('found %d pages' % len(htmls))

            instances = gen_nugget_train(ini, htmls, query_str, good_text)
            for candidate, is_good, features in instances:
                writer.write('%d,%s#%s\n' % (
                    is_good,
                    ','.join(map(str, features)),
                    '%s:%s' % (query_str, candidate)))
            # Flush per query so finished queries reach disk before the next
            # (potentially slow) search starts.
            writer.flush()

            # Best-effort reset of the scratch folder between queries.
            try:
                rmtree(tmp_folder)
                os.mkdir(tmp_folder)
            except Exception as e:
                print(e)
# NOTE(review): whitespace-collapsed excerpt. The start of the function that
# owns the loop below and the remainder of the entry point are not visible.
#
# Part 1 -- loop over the lines of "<ntcir_urls_folder>/<query_id>.MAND.tsv":
#   each stripped line is joined onto ntcir_htmls_folder and turned into a
#   "file://" URL, which is passed to web_search.copy_existing_page and then
#   to web_search.ensure_page_query_link together with qid and the current
#   rank; rank is incremented once per line. The value returned by
#   copy_existing_page (rid) is not used in the visible code.
#
# Part 2 -- script entry point: argv[1] is the ini file, argv[2] the queries
#   file and argv[3] the run number (converted with int()). The
#   'search_engine' setting is overwritten with 'NTCIR'. The four ntcir_*
#   settings are read with ini[...] and have no fallback; tmp_folder falls
#   back to './tmp' and cache_folder to "<tmp_folder>/cache". The cache
#   database is then opened with web_search.open_db(cache_folder).
for file_name in file("%s/%s.MAND.tsv" % (ntcir_urls_folder, query_id,)): full_name = "%s/%s" % (ntcir_htmls_folder, file_name.strip()) url = "file://%s" % (full_name,) rid = web_search.copy_existing_page(cursor, cache_files_folder, full_name, url) web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url) rank += 1 if __name__ == '__main__': #### # input: ini file and queries file # ini_file = sys.argv[1] query_file = sys.argv[2] run_number = int(sys.argv[3]) ini = load_ini(ini_file) queries = load_queries(query_file) ini['search_engine'] = 'NTCIR' # these entries must be defined in the ini file system_description = ini['ntcir_system_description'] team_name = ini['ntcir_team_name'] ntcir_urls_folder = ini['ntcir_urls_folder'] ntcir_htmls_folder = ini['ntcir_htmls_folder'] base_tmp = ini.get('tmp_folder', './tmp') cache_folder = ini.get('cache_folder', "%s/cache" % (base_tmp,)) (conn, cursor) = web_search.open_db(cache_folder)
# NOTE(review): whitespace-collapsed excerpt. The header and earlier body of
# the function that the first two statements belong to are not visible.
#
# Part 1 -- function tail: stores the value of
#   assemble_output(final_passages_scored, final_length) under
#   results[output_type] and returns the pair (results, html_urls).
#
# Part 2 -- script entry point: argv[1] is the ini file and all remaining
#   arguments are joined with single spaces to form the query. It calls
#   one_click_search with the pairs (1000, 'DESKTOP'), (140, 'TWITTER') and
#   (280, 'MOBILE') -- presumably per-output length limits; confirm against
#   one_click_search. For every output type in the results it writes
#   "<TYPE>text</TYPE>" (text = first element of that type's result) as one
#   line of "<tmp_folder>/out", where tmp_folder falls back to './tmp', and
#   finally prints the elapsed wall-clock time.
#
# NOTE(review): the output file is opened with file() and closed by an
#   explicit close() call; there is no with/try-finally, so an exception
#   while writing would leave it unclosed.
results[output_type] = assemble_output(final_passages_scored, final_length) return (results, html_urls) if __name__ == '__main__': #### # input: ini file and query # import time t0 = time.time() ini_file = sys.argv[1] query_str = " ".join(sys.argv[2:]) ini = load_ini(ini_file) (results, html_urls) = one_click_search(ini, query_str, [(1000, 'DESKTOP'), (140, 'TWITTER'), (280, 'MOBILE')]) tmp_folder = ini.get('tmp_folder', './tmp') output_file = "%s/out" % (tmp_folder, ) output = file(output_file, 'w') for output_type in results: output_text = results[output_type][0] output.write("<%s>%s</%s>\n" % (output_type, output_text, output_type)) output.close() print 'time:', time.time() - t0