Example #1
def identify_candidates(passages, main_passage_count, top_documents):
    potential_candidates = dict()
    stopwords = set(nltk.corpus.stopwords.words("english"))  # set for O(1) membership tests

    # parse all passages
    seen_documents = set()
    processed_passages = 0
    for idx in xrange(0, len(passages)):
        if int(passages[idx][0]["document"]) > top_documents:
            continue
        if passages[idx][0]["document"] in seen_documents:
            continue
        if processed_passages > main_passage_count:
            break
        processed_passages += 1
        seen_documents.add(passages[idx][0]["document"])
        passage_text = passages[idx][1]
        chunks = []
        if USE_PATTERNS:
            chunks += parse_pattern_chunks(passage_text)
        chunks += parse_into_chunks(passage_text)
        passage_counted = set()
        for chunk in chunks:
            chunk = (chunk[0], map(lambda x: re.sub("[^A-Za-z0-9]", " ", x).strip(), chunk[1]))
            as_str = " ".join(chunk[1]).strip()

            if as_str.lower() in stopwords:
                continue

            info = potential_candidates.get(as_str, dict())
            info["tokens"] = chunk[1]
            info["type"] = "NE" if info.get("type", "Non-NE") == "NE" else chunk[0]  # once NE, always NE
            info["score"] = passages[idx][0]["score"] + 100.0 + info.get("score", 0.0)
            info["total_count"] = 1 + info.get("total_count", 1)
            if not as_str in passage_counted:
                info["passage_count"] = 1 + info.get("passage_count", 1)
                evidence = info.get("passages", list())
                evidence.append(passages[idx])
                info["passages"] = evidence
                passage_counted.add(as_str)

            potential_candidates[as_str] = info

    # keep all NEs plus "important" Non-NEs (might need corpus frequencies for that)
    result_candidates = []
    result_evidence = dict()
    for potential, entry in potential_candidates.items():
        if True:  # placeholder filter: entry['type'] == 'NE' or entry['passage_count'] > entry['total_count'] * 0.8
            result_candidates.append((potential, entry["tokens"]))
            result_evidence[potential] = {"type": entry["type"], "score": entry["score"], "passages": entry["passages"]}
    return (result_candidates, result_evidence)
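
A minimal sketch of how identify_candidates might be driven, assuming parse_into_chunks and its companions from the same module are importable. Each passage is a (metadata, text) pair whose metadata carries "document" and "score" keys, the shape implied by the accesses above; the toy passages here are made up:

toy_passages = [
    ({"document": "1", "score": 3.2}, "Marie Curie won the Nobel Prize in 1903."),
    ({"document": "2", "score": 2.7}, "Curie shared the 1903 prize with Pierre Curie."),
]

candidates, evidence = identify_candidates(toy_passages,
                                           main_passage_count=2,
                                           top_documents=20)
for surface_form, tokens in candidates:
    # evidence maps each surface form to its type, accumulated score, and passages
    print surface_form, evidence[surface_form]["type"], evidence[surface_form]["score"]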
Example #2
def gen_nugget_train(ini, htmls, query_str, good_text):
    from nugget_finder import load_ini, do_search, identify_candidates
    
    tmp_folder = ini.get('tmp_folder', './tmp')
    good_text = good_text.lower()

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder,)
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)
        
    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html,)
        if os.path.exists(cached_detag):
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html,outfile=outfile,id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder,)
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)
        
    config_template = open(ini.get('index_config_template', "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder,)
    config_file = open(config_filename, "w")
    config_file.write(config_template.format(path_to_corpus=path_to_corpus,
                                             path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get('index_command', 'IndriBuildIndex')

    retcode = subprocess.call([index_command, config_filename], stdout=sys.stderr, stderr=sys.stderr)
    assert retcode == 0, "indexing failed"

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)

    if bool(ini.get('condition_baseline', '')):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get('search_command', './cpp/Search')    
    main_passages = do_search(parsed_query, search_command, path_to_index, 2000)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get('top_documents_for_candidate', '20'))
    candidates, main_evidence = identify_candidates(main_passages,
                                                    int(ini.get('main_search_passage_count', 3)),
                                                    top_documents)
    print 'candidate num:', len(candidates)

    ###
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    t0 = time.time()
    searcher = Searcher(search_command, path_to_index,
                                      int(ini.get('evidence_search_passage_count', 10)))
    p = Pool(8)
    queries = map(lambda candidate: list(parsed_query) + [('NE', candidate[1])], candidates)
    evidence_passages_list = p.map(searcher, queries, 50)
    p.close()
    print 'pool map evidence search:', time.time() - t0
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        # keep only evidence passages that contain every token of the candidate
        evidence[candidate[0]] = filter(lambda passage:
                                        all(map(lambda token: token.lower() in passage[1].lower(),
                                                candidate[1])),
                                        evidence_passages_list[i])

    ####
    # evaluate evidence
    #
    sys.stderr.write("Generating Training...\n")
    instances = []
    total = len(evidence)
    t0 = time.time()
    gen = TrainGenerator(ini.get('dumpindex_command', 'dumpindex'), ini.get('stat_index'))
    inputs = map(lambda candidate: (candidate, evidence[candidate], main_evidence[candidate], good_text), evidence.keys()) 
    p = Pool(8)
    instances = p.map(gen, inputs, 50)
    p.close()
    print 'pool map evaluating:', time.time() - t0

    ####
    # clean up
    #
    for i in xrange(0, html_count):
        try:
            os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
        except OSError:
            pass

    return instances
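
A minimal, hypothetical driver for gen_nugget_train. The ini object is only read through .get(key, default), so a plain dict works; every path and command below is a placeholder, and the Indri tooling (IndriBuildIndex, the ./cpp/Search binary) must actually exist for the run to get past indexing:

ini = {
    'tmp_folder': './tmp',
    'index_config_template': './indexing.template',
    'index_command': 'IndriBuildIndex',
    'search_command': './cpp/Search',
    'main_search_passage_count': 3,
    'top_documents_for_candidate': '20',
    'evidence_search_passage_count': 10,
    'dumpindex_command': 'dumpindex',
    'stat_index': './stat_index',  # assumed background-statistics index
}
htmls = ['./pages/doc0.html', './pages/doc1.html']  # made-up input documents
instances = gen_nugget_train(ini, htmls,
                             "who discovered radium",
                             "marie curie discovered radium")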
Example #3
def find_nuggets(ini, htmls, query_str):
    tmp_folder = ini.get("tmp_folder", "./tmp")

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder,)
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)

    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html,)
        if os.path.exists(cached_detag):
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html, outfile=outfile, id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder,)
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)

    config_template = open(ini.get("index_config_template", "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder,)
    config_file = open(config_filename, "w")
    config_file.write(config_template.format(path_to_corpus=path_to_corpus, path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get("index_command", "IndriBuildIndex")

    retcode = subprocess.call([index_command, config_filename], stdout=sys.stderr, stderr=sys.stderr)
    assert retcode == 0

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)

    if bool(ini.get("condition_baseline", "")):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get("search_command", "./cpp/Search")
    main_passages = do_search(parsed_query, search_command, path_to_index, 2000)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get("top_documents_for_candidate", "20"))
    candidates, main_evidence = identify_candidates(
        main_passages, int(ini.get("main_search_passage_count", 3)), top_documents
    )

    ###
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    # serial per-candidate evidence search, superseded by the parallel Searcher below

    searcher = Searcher(search_command, path_to_index, int(ini.get("evidence_search_passage_count", 10)))
    print "candidate num:%d" % len(candidates)
    queries = map(lambda candidate: list(parsed_query) + [("NE", candidate[1])], candidates)
    evidence_passages_list = fastmap.fastmap(searcher, 10, queries)
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        # keep only evidence passages that contain every token of the candidate
        evidence[candidate[0]] = filter(
            lambda passage: all(map(lambda token: token.lower() in passage[1].lower(), candidate[1])),
            evidence_passages_list[i],
        )

    ####
    # evaluate evidence
    #
    sys.stderr.write("Evaluating evidence...\n")
    scored_candidates = list()

    if USE_CANDIDATE_SCORER:
        pool_scorer = PoolScorer(ini)
        scored_candidates = fastmap.fastmap(
            pool_scorer, 10,
            map(
                lambda candidate: (candidate, evidence[candidate], main_evidence[candidate], parsed_query),
                evidence.keys(),
            ),
        )
    else:
        for candidate in evidence:
            scored_candidates.append(
                (
                    candidate,
                    score_candidate(candidate, evidence[candidate], main_evidence[candidate], parsed_query),
                    evidence[candidate],
                )
            )
    ####
    # clean up
    #
    if False:  # cleanup disabled; flip to True to remove the extracted text files
        for i in xrange(0, html_count):
            try:
                os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
            except OSError:
                pass

    ####
    # show candidates
    #
    if False:  # debug listing disabled; flip to True to print ranked candidates
        scored_candidates.sort(key=itemgetter(1), reverse=True)
        rank = 0
        for candidate_score in scored_candidates:
            candidate, score, candidate_evidence = candidate_score
            print candidate
            print "\t", rank, score
            rank += 1

    return (scored_candidates, parsed_query, path_to_index)
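
find_nuggets returns (scored_candidates, parsed_query, path_to_index), with each scored candidate a (candidate, score, evidence_passages) triple. A small sketch of ranking its output, reusing the hypothetical ini and htmls from the driver above:

from operator import itemgetter

scored_candidates, parsed_query, path_to_index = find_nuggets(
    ini, htmls, "who discovered radium")

# rank by score, highest first, and show the top ten nuggets
ranked = sorted(scored_candidates, key=itemgetter(1), reverse=True)
for rank, (candidate, score, candidate_evidence) in enumerate(ranked[:10]):
    print rank, candidate, score, len(candidate_evidence), "passages"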