Example #1
from whoosh import index, scoring
from whoosh.qparser import QueryParser, OrGroup
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords


def getSearchEngineResult(query_dict):
    result_dict = {}
    ix = index.open_dir("index")

    # with ix.searcher(weighting=scoring.BM25F()) as searcher:
    # ScoringFunction is this project's custom weighting class, not stock Whoosh;
    # the commented-out line above shows the stock BM25F equivalent.
    with ix.searcher(weighting=scoring.ScoringFunction()) as searcher:
        # TODO - Define your own query parser
        parser = QueryParser("contents",
                             schema=ix.schema,
                             group=OrGroup.factory(0))
        stemmizer = LancasterStemmer()
        stopWords = set(stopwords.words('english'))

        # print(stopWords)
        for qid, q in query_dict.items():

            table = str.maketrans('\n?.,!', '     ')
            q_nomark = q.translate(table)

            new_q = ''
            for word in q_nomark.split():  # split() also drops the empty tokens left by the translate()
                if word.lower() not in stopWords:
                    word_stem = stemmizer.stem(word.lower())
                    new_q += word_stem + ' '
            # print(new_q)
            query = parser.parse(new_q.lower())
            results = searcher.search(query, limit=None)
            # for result in results:
            #     print(result.fields()['docID'], result.score)

            result_dict[qid] = [result.fields()['docID'] for result in results]
    return result_dict
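As noted in the comment, scoring.ScoringFunction is a project-defined weighting, so the snippet will not run against stock Whoosh as-is. A minimal sketch of the stock alternative, scoring.FunctionWeighting, with a made-up per-posting scorer (both the function name and its body are illustrative only):

from whoosh import index, scoring

# Illustrative scorer: Whoosh calls this once per matched posting.
# Returning the raw term weight is a stand-in for a real formula like BM25F.
def simple_weight(searcher, fieldname, text, matcher):
    return matcher.weight()

ix = index.open_dir("index")
with ix.searcher(weighting=scoring.FunctionWeighting(simple_weight)) as searcher:
    pass  # parse and run queries exactly as in the example above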
Example #2
    def search(self, text: str, limit: int, timelimit=3.0):
        with self.index.searcher() as searcher:
            or_group = OrGroup.factory(.9)
            parser = MultifieldParser(['content', 'quiz_bowl'],
                                      schema=self.schema,
                                      group=or_group)
            text_query = parser.parse(text)
            collector = searcher.collector(limit=limit)
            tlc = TimeLimitCollector(collector, timelimit=timelimit)
            partial = True
            try:
                searcher.search_with_collector(text_query, tlc)
                partial = False
            except searching.TimeLimit:
                pass

            # There is a bug in Whoosh that makes calling len() directly or indirectly fail,
            # which is why we don't use list()
            results = [(r['page'], r.score) for r in tlc.results()]

            # Logging based on partial instead of directly is required due to a mysterious race
            # condition between whoosh time limits and log.info. It's important that all of whoosh's
            # functions, including search_with_collector() and tlc.results(), are called before
            # logging anything
            if partial:
                log.info(
                    'Search took longer than {}s, getting partial results'.
                    format(timelimit))

            if len(results) == 0:
                return [('<UNK_ANSWER>', 0)]

            return results
Example #3
    def __init__(self,
                 index_dir,
                 index_name='dialogues',
                 context_size=1,
                 rewrite=False,
                 num_candidates=5,
                 slot_detector=None):
        '''
        Load index from index_dir or build it from dialogues.
        context_size: number of previous utterances to include
        '''
        self.slot_detector = slot_detector
        self.index_name = index_name
        if not index.exists_in(index_dir, indexname=index_name) or rewrite:
            if not os.path.exists(index_dir):
                print('Create index in', index_dir)
                os.makedirs(index_dir)
            elif rewrite:
                print('Rewrite index in', index_dir)
                shutil.rmtree(index_dir)
                os.makedirs(index_dir)
            self.ix = index.create_in(index_dir,
                                      schema=DialogueSchema,
                                      indexname=index_name)
            self.loaded_index = False
        else:
            print('Load index from', index_dir)
            self.ix = index.open_dir(index_dir, indexname=index_name)
            self.loaded_index = True
        self.context_size = context_size
        self.parser_icontext = QueryParser('immediate_context',
                                           schema=self.ix.schema,
                                           group=OrGroup.factory(0.9))
        self.parser_pcontext = QueryParser('prev_context',
                                           schema=self.ix.schema,
                                           group=OrGroup.factory(0.9))
        self.parser_title = QueryParser('title',
                                        schema=self.ix.schema,
                                        group=OrGroup.factory(0.9))

        self.num_candidates = num_candidates
        self.empty_candidates = [{} for _ in range(self.num_candidates)]

        self.search_time = 0.
        self.num_query = 0.
        self.num_empty = 0
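DialogueSchema is imported from elsewhere in this project and is not shown. A hypothetical reconstruction consistent with the three parsers above; only the three field names are attested by the code, while the field types and the stored response field are assumptions:

from whoosh.fields import ID, TEXT, Schema

# Hypothetical sketch of the schema this retriever expects (types assumed).
DialogueSchema = Schema(
    title=TEXT(stored=True),
    immediate_context=TEXT(stored=True),
    prev_context=TEXT(stored=True),
    response=ID(stored=True),  # assumed: the stored payload each candidate returns
)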
Example #4
def before_first_request():
    global parser, searcher
    index = load_index()
    og = OrGroup.factory(0.9)
    parser = QueryParser("text", schema=index.schema, group=og)
    searcher = index.searcher()

    app.logger.addHandler(logging.StreamHandler(sys.stdout))
    app.logger.setLevel(logging.INFO)
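A hypothetical route that consumes the module-level parser and searcher; the endpoint path and the assumption that matched fields are stored are illustrative, not part of the original app:

from flask import jsonify, request

@app.route("/search")
def search():
    # Parse the user's text with the OR-with-bonus parser built above.
    hits = searcher.search(parser.parse(request.args.get("q", "")), limit=10)
    return jsonify(results=[hit.fields() for hit in hits])

Note that a searcher opened once at startup sees a snapshot of the index; Whoosh's Searcher.refresh() returns an updated view if documents are committed later.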
Example #5
File: __init__.py  Project: mvdbeek/galaxy
    def search(self, q: str, tool_name_boost: CanConvertToFloat,
               tool_id_boost: CanConvertToFloat,
               tool_section_boost: CanConvertToFloat,
               tool_description_boost: CanConvertToFloat,
               tool_label_boost: CanConvertToFloat,
               tool_stub_boost: CanConvertToFloat,
               tool_help_boost: CanConvertToFloat,
               tool_search_limit: CanConvertToFloat,
               tool_enable_ngram_search: bool,
               tool_ngram_minsize: CanConvertToInt,
               tool_ngram_maxsize: CanConvertToInt) -> List[str]:
        """
        Perform a search on the in-memory index, applying the given boosts.
        """
        # Change field boosts for searcher
        self.searcher = self.index.searcher(weighting=MultiWeighting(
            BM25F(),
            old_id=BM25F(old_id_B=float(tool_id_boost)),
            name=BM25F(name_B=float(tool_name_boost)),
            section=BM25F(section_B=float(tool_section_boost)),
            description=BM25F(description_B=float(tool_description_boost)),
            labels=BM25F(labels_B=float(tool_label_boost)),
            stub=BM25F(stub_B=float(tool_stub_boost)),
            help=BM25F(help_B=float(tool_help_boost))))
        # Use OrGroup to change the default operation for joining multiple terms to logical OR.
        # This means e.g. for the search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
        # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
        # However, this changes scoring: for the search 'bowtie of king arthur', a document with 'arthur arthur arthur'
        # would score higher than a document with 'bowtie arthur', which is usually unexpected for a user.
        # Hence we introduce a bonus on multi-term hits via the factory() method, which takes a scaling
        # factor between 0 and 1 (contrasted in the standalone sketch after this example).
        # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
        # A FuzzyTermPlugin with a max edit distance of 2 could account for misspellings
        # and typos, but it is not actually added in this excerpt.
        og = OrGroup.factory(0.9)
        self.parser = MultifieldParser([
            'name', 'old_id', 'description', 'section', 'help', 'labels',
            'stub'
        ],
                                       schema=self.schema,
                                       group=og)

        cleaned_query = q.lower()
        if tool_enable_ngram_search is True:
            rval = self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                       tool_ngram_maxsize, tool_search_limit)
            return rval
        else:
            cleaned_query = ' '.join(token.text
                                     for token in self.rex(cleaned_query))
            # Use asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
            parsed_query = self.parser.parse(f"*{cleaned_query}*")
            hits = self.searcher.search(parsed_query,
                                        limit=float(tool_search_limit),
                                        sortedby='')
            return [hit['id'] for hit in hits]
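The comment block above describes the behavior every example on this page leans on: plain OrGroup ranks purely by summed term weights, while OrGroup.factory(scale) adds a bonus for matching more of the distinct query terms. A self-contained sketch contrasting the two with an in-memory RamStorage index and two toy documents (exact scores vary by Whoosh version, so it simply prints both rankings):

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import OrGroup, QueryParser

schema = Schema(doc_id=ID(stored=True), content=TEXT)
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(doc_id=u"1", content=u"arthur arthur arthur")
    w.add_document(doc_id=u"2", content=u"bowtie arthur")

plain = QueryParser("content", schema=schema, group=OrGroup)
scaled = QueryParser("content", schema=schema, group=OrGroup.factory(0.9))

with ix.searcher() as s:
    for label, parser in (("plain OrGroup", plain), ("factory(0.9)", scaled)):
        hits = s.search(parser.parse(u"bowtie of king arthur"))
        # The factory variant adds a bonus to doc 2 for matching more distinct terms.
        print(label, [(hit["doc_id"], round(hit.score, 3)) for hit in hits])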
Example #6
    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_label_boost, tool_stub_boost,
               tool_help_boost, tool_search_limit, tool_enable_ngram_search,
               tool_ngram_minsize, tool_ngram_maxsize):
        """
        Perform a search on the in-memory index, applying the given boosts.
        """
        # Change field boosts for searcher
        self.searcher = self.index.searcher(weighting=BM25F(
            field_B={
                'name_B': float(tool_name_boost),
                'section_B': float(tool_section_boost),
                'description_B': float(tool_description_boost),
                'labels_B': float(tool_label_boost),
                'stub_B': float(tool_stub_boost),
                'help_B': float(tool_help_boost)
            }))
        # Use OrGroup to change the default operation for joining multiple terms to logical OR.
        # This means e.g. for the search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
        # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
        # However, this changes scoring: for the search 'bowtie of king arthur', a document with 'arthur arthur arthur'
        # would score higher than a document with 'bowtie arthur', which is usually unexpected for a user.
        # Hence we introduce a bonus on multi-term hits via the factory() method, which takes a scaling
        # factor between 0 and 1.
        # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
        og = OrGroup.factory(0.9)
        self.parser = MultifieldParser(
            ['name', 'description', 'section', 'help', 'labels', 'stub'],
            schema=self.schema,
            group=og)
        cleaned_query = q.lower()
        # Replace hyphens, since they are wildcards in Whoosh causing false positives
        if cleaned_query.find('-') != -1:
            cleaned_query = (' ').join(
                token.text for token in self.rex(to_unicode(cleaned_query)))
        if tool_enable_ngram_search is True:
            rval = self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                       tool_ngram_maxsize, tool_search_limit)
            return rval
        else:
            # Use asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
            parsed_query = self.parser.parse(cleaned_query + '*')
            hits = self.searcher.search(parsed_query,
                                        limit=float(tool_search_limit),
                                        sortedby='')
            return [hit['id'] for hit in hits]
Example #7
def getRandom15SearchResult(query_dict):
    """
    Get Random Search results for checking variance, max and min value of BPREF
    Query Number 8,50,59 should be avoided hence those queries are only related to one document.
    :param query_dict: standard query_dict
    :return: result dictionary which contains every info about search result
    """
    result_dict = {}
    tot_result_dict = {}
    ix = index.open_dir("index")

    # ScoringFunction is this project's custom weighting class (see Example #1).
    with ix.searcher(weighting=scoring.ScoringFunction()) as searcher:
        parser = QueryParser("contents", schema=ix.schema, group=OrGroup.factory(0.4))

        # split_stem and parse_document are project-local helpers (not shown here).
        from whoosh.lang.porter2 import stem as ps2
        query_dict = {k: split_stem(stem_fn=ps2, value=v) for k, v in query_dict.items()}

        import random
        r = [i for i in range(1, 94) if i not in [8, 50, 59]]
        rand_15 = random.sample(r, 15)

        for qid in rand_15:
            q = query_dict[qid]
            query = parser.parse(q)
            results = searcher.search(query, limit=None)
            result_dict[qid] = [result.fields()['docID'] for result in results]

            query_expansion = parse_document(result_dict[qid], 3, 2)
            expand_set = set([ps2(word[0]) for word in query_expansion]).union(
                set([word[0] for word in query_expansion]))
            q_set = set(q.split())

            unique_set = expand_set - q_set
            q += " " + " ".join(unique_set)

            tot_query = parser.parse(q)
            tot_results = searcher.search(tot_query, limit=None)

            tot_result_dict[qid] = [result.fields()['docID'] for result in tot_results]

        print("Random 15 queries: " + ",".join(str(x) for x in rand_15))
    return tot_result_dict
Example #8
def search_index(query, score_func_name, dirname):
    ix = index.open_dir(dirname, schema=get_schema())
    og = OrGroup.factory(0.9)
    qp = QueryParser("content", schema=get_schema(), group=og)
    # qp.add_plugin(FuzzyTermPlugin())
    # query = ' '.join([(x + '~' if len(x) > 5 else x) for x in query.split(' ')])
    q = qp.parse(query)
    # Map score-function names to weightings; OkBM25 is the default.
    score_funcs = {
        'ok': OkBM25,
        'bm25f': BM25F,
        'pln': PLN,
        'tfidf': TF_IDF,
        'freq': Frequency,
    }
    score_func = score_funcs.get(score_func_name, OkBM25)()
    searcher = ix.searcher(weighting=score_func)
    results = searcher.search(q, limit=None)
    results.fragmenter.surround = 100
    return results
Example #9
"""Set up app config."""

import os
from pathlib import Path

from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser, OrGroup

WHOOSH_INDEX_DIR = Path(os.getenv("WHOOSH_INDEX_DIR", "whoosh_index"))
SEARCH_INDEX = open_dir(WHOOSH_INDEX_DIR)
QUERY_PARSER = MultifieldParser(["name", "artist_name"],
                                SEARCH_INDEX.schema,
                                group=OrGroup.factory(0.9))
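A minimal usage sketch for this config module, assuming name and artist_name are stored fields in the schema:

def find_tracks(text, limit=10):
    # Open a short-lived searcher per query over the shared module-level index.
    with SEARCH_INDEX.searcher() as searcher:
        hits = searcher.search(QUERY_PARSER.parse(text), limit=limit)
        return [(hit["name"], hit["artist_name"]) for hit in hits]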
Example #10
# Method 1: use a FileStorage object
from whoosh.filedb.filestore import FileStorage
storage = FileStorage('index')  # 'index' is the index directory path
idx1 = storage.open_index(indexname='idx1')

from whoosh import index
# Method 2: use the open_dir function
from whoosh.index import open_dir
idx2 = open_dir('index', indexname='idx2')  # indexname selects the named index
print(index.exists_in('index', indexname='idx2'))

from whoosh.qparser import QueryParser, MultifieldParser, OrGroup, FieldsPlugin

og = OrGroup.factory(0.9)

qp = QueryParser("content", schema=idx1.schema)  # group=OrGroup
qp.remove_plugin_class(FieldsPlugin)
q = qp.parse("reset")
print(q)
# mqp = MultifieldParser(["title", "content"], schema=idx1.schema)
# mq = mqp.parse(u"many only")
#
# from whoosh.query import *
# myquery = And([Term("title", u"third"), q])
# # myquery = Term("title", u"ird")
# print(myquery)
searcher = idx1.searcher()
r = searcher.search(q=q, limit=None)
print(len(r))
for hit in r:
    print(hit)