def one_click_search(ini, query_str, outputs):

    if bool(ini.get('condition_no_cclparser', '')) or \
           bool(ini.get('condition_baseline', '')):
        parser.USE_CCLPARSER = False

    if bool(ini.get('condition_no_boilerplate', '')) or \
           bool(ini.get('condition_baseline', '')):
        html_to_trec.USE_BOILERPLATE = False

    if bool(ini.get('condition_patterns', '')):
        nugget_finder.USE_PATTERNS = True

    if bool(ini.get('condition_candidate_scorer', '')):
        nugget_finder.USE_CANDIDATE_SCORER = True

    if bool(ini.get('condition_no_ilp', '')):
        USE_ILP = False
    else:
        USE_ILP = True

    ####
    # fetch results from Web search engine (or cache)
    #
    (htmls, html_urls) = query_web_search(query_str, ini)
    print "found", len(htmls), "pages"

    ####
    # extract relevant nuggets
    #
    (scored_candidates, parsed_query, path_to_index) = find_nuggets(ini, htmls, query_str)

    ####
    # final output
    #
    final_passages = do_search(parsed_query, ini.get('search_command', './cpp/Search'),
                               path_to_index, int(ini.get('main_search_passage_count', 3)))
    results = {}
    if USE_ILP:
        ####
        # assemble final output
        #
        for (final_length, output_type) in outputs:
            results[output_type] = assemble_output_ilp(final_passages, scored_candidates, final_length)
    else:
        ####
        # score final passages
        #
        final_passages_scored = score_passages(final_passages, scored_candidates)

        ####
        # assemble final output
        #
        for (final_length, output_type) in outputs:
            results[output_type] = assemble_output(final_passages_scored, final_length)
    
    return (results, html_urls)
示例#2
0
    def __call__(self, query):
        """Search for ``query`` with this object's stored settings.

        Delegates to ``nugget_finder.do_search`` using ``self.search_command``,
        ``self.index_path`` and ``self.ret_size``, and returns its result
        unchanged.
        """
        # Imported at call time, exactly as before (all three names).
        from nugget_finder import load_ini, do_search, identify_candidates

        passages = do_search(query,
                             self.search_command,
                             self.index_path,
                             self.ret_size)
        return passages
 def __call__(self, query):
     """Run ``do_search`` for ``query`` and return whatever it returns.

     The search command, index path and result size are taken from
     ``self.search_command``, ``self.index_path`` and ``self.ret_size``.
     """
     # Imported at call time, exactly as before (all three names).
     from nugget_finder import load_ini, do_search, identify_candidates

     settings = (self.search_command, self.index_path, self.ret_size)
     return do_search(query, *settings)
def gen_nugget_train(ini, htmls, query_str, good_text):
    from nugget_finder import load_ini, do_search, identify_candidates
    
    tmp_folder = ini.get('tmp_folder', './tmp')
    good_text = good_text.lower()

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder,)
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)
        
    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html,)
        if os.path.exists(cached_detag):
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html,outfile=outfile,id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder,)
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)
        
    config_template = file(ini.get('index_config_template', "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder,)
    config_file = open(config_filename, "w")
    config_file.write(config_template.format(path_to_corpus=path_to_corpus,
                                             path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get('index_command', 'IndriBuildIndex')

    retcode=subprocess.call([ index_command, config_filename ], stdout=sys.stderr, stderr=sys.stderr)
    assert retcode==0

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)

    if bool(ini.get('condition_baseline', '')):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get('search_command', './cpp/Search')    
    main_passages = do_search(parsed_query, search_command, path_to_index, 2000)
    #print 'passage num:', len(main_passages)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get('top_documents_for_candidate', '20'))
    candidates, main_evidence = identify_candidates(main_passages,
                                                    int(ini.get('main_search_passage_count', 3)),
                                                    top_documents)
    print 'candidate num:', len(candidates)

    ###
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    t0 = time.time()
    searcher = Searcher(search_command, path_to_index,
                                      int(ini.get('evidence_search_passage_count', 10)))
    p = Pool(8)
    queries = map(lambda candidate: list(parsed_query) + [('NE', candidate[1] )], candidates)
    evidence_passages_list = p.map(searcher, queries, 50)
    p.close()
    print 'pool map evidence search:', time.time() - t0;
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        evidence[candidate[0]] = filter(lambda passage: 
                                        all(map(lambda token: token.lower() in passage[1].lower(), candidate[1])), evidence_passages_list[i])

    ####
    # evaluate evidence
    #
    sys.stderr.write("Generating Training...\n")
    instances = []
    total = len(evidence)
    t0 = time.time()
    gen = TrainGenerator(ini.get('dumpindex_command', 'dumpindex'), ini.get('stat_index'))
    inputs = map(lambda candidate: (candidate, evidence[candidate], main_evidence[candidate], good_text), evidence.keys()) 
    p = Pool(8)
    instances = p.map(gen, inputs, 50)
    p.close()
    print 'pool map evaluating:', time.time() - t0

    ####
    # clean up
    #
    for i in xrange(0, html_count):
        try:
            os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
        except:
            pass

    return instances
示例#5
0
def one_click_search(ini, query_str, outputs):

    if bool(ini.get('condition_no_cclparser', '')) or \
           bool(ini.get('condition_baseline', '')):
        parser.USE_CCLPARSER = False

    if bool(ini.get('condition_no_boilerplate', '')) or \
           bool(ini.get('condition_baseline', '')):
        html_to_trec.USE_BOILERPLATE = False

    if bool(ini.get('condition_patterns', '')):
        nugget_finder.USE_PATTERNS = True

    if bool(ini.get('condition_candidate_scorer', '')):
        nugget_finder.USE_CANDIDATE_SCORER = True

    if bool(ini.get('condition_no_ilp', '')):
        USE_ILP = False
    else:
        USE_ILP = True

    ####
    # fetch results from Web search engine (or cache)
    #
    (htmls, html_urls) = query_web_search(query_str, ini)
    print "found", len(htmls), "pages"

    ####
    # extract relevant nuggets
    #
    (scored_candidates, parsed_query,
     path_to_index) = find_nuggets(ini, htmls, query_str)

    ####
    # final output
    #
    final_passages = do_search(parsed_query,
                               ini.get('search_command', './cpp/Search'),
                               path_to_index,
                               int(ini.get('main_search_passage_count', 3)))
    results = {}
    if USE_ILP:
        ####
        # assemble final output
        #
        for (final_length, output_type) in outputs:
            results[output_type] = assemble_output_ilp(final_passages,
                                                       scored_candidates,
                                                       final_length)
    else:
        ####
        # score final passages
        #
        final_passages_scored = score_passages(final_passages,
                                               scored_candidates)

        ####
        # assemble final output
        #
        for (final_length, output_type) in outputs:
            results[output_type] = assemble_output(final_passages_scored,
                                                   final_length)

    return (results, html_urls)
示例#6
0
def gen_nugget_train(ini, htmls, query_str, good_text):
    from nugget_finder import load_ini, do_search, identify_candidates

    tmp_folder = ini.get('tmp_folder', './tmp')
    good_text = good_text.lower()

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder, )
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)

    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html, )
        if os.path.exists(cached_detag):
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html, outfile=outfile, id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder, )
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)

    config_template = file(
        ini.get('index_config_template', "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder, )
    config_file = open(config_filename, "w")
    config_file.write(
        config_template.format(path_to_corpus=path_to_corpus,
                               path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get('index_command', 'IndriBuildIndex')

    retcode = subprocess.call([index_command, config_filename],
                              stdout=sys.stderr,
                              stderr=sys.stderr)
    assert retcode == 0

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)

    if bool(ini.get('condition_baseline', '')):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get('search_command', './cpp/Search')
    main_passages = do_search(parsed_query, search_command, path_to_index,
                              2000)
    #print 'passage num:', len(main_passages)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get('top_documents_for_candidate', '20'))
    candidates, main_evidence = identify_candidates(
        main_passages, int(ini.get('main_search_passage_count', 3)),
        top_documents)
    print 'candidate num:', len(candidates)

    ###
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    t0 = time.time()
    searcher = Searcher(search_command, path_to_index,
                        int(ini.get('evidence_search_passage_count', 10)))
    p = Pool(8)
    queries = map(
        lambda candidate: list(parsed_query) + [('NE', candidate[1])],
        candidates)
    evidence_passages_list = p.map(searcher, queries, 50)
    p.close()
    print 'pool map evidence search:', time.time() - t0
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        evidence[candidate[0]] = filter(
            lambda passage: all(
                map(lambda token: token.lower() in passage[1].lower(),
                    candidate[1])), evidence_passages_list[i])

    ####
    # evaluate evidence
    #
    sys.stderr.write("Generating Training...\n")
    instances = []
    total = len(evidence)
    t0 = time.time()
    gen = TrainGenerator(ini.get('dumpindex_command', 'dumpindex'),
                         ini.get('stat_index'))
    inputs = map(
        lambda candidate:
        (candidate, evidence[candidate], main_evidence[candidate], good_text),
        evidence.keys())
    p = Pool(8)
    instances = p.map(gen, inputs, 50)
    p.close()
    print 'pool map evaluating:', time.time() - t0

    ####
    # clean up
    #
    for i in xrange(0, html_count):
        try:
            os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
        except:
            pass

    return instances