Exemplo n.º 1
0
 def _parse_and_search(
         self,
         searcher: Searcher,
         content: str,
         value: str,
         limit: Optional[int] = None,
         terms: bool = False,
         group: Type[Union[AndGroup, OrGroup]] = OrGroup) -> Results:
     """Parse *value* as a query on the *content* field and run it.

     Field-prefix ("field:term") and wildcard syntax are disabled so the
     query text is treated as plain terms.

     :param searcher: open whoosh Searcher to run the query on
     :param content: name of the schema field to search
     :param value: raw query string
     :param limit: maximum number of hits (None = unlimited)
     :param terms: whether to record matched terms in the results
     :param group: grouping class (AND vs OR semantics), OR by default
     :return: whoosh Results for the parsed query
     """
     query_parser = QueryParser(content, self._ix.schema, group=group)
     for plugin_cls in (FieldsPlugin, WildcardPlugin):
         query_parser.remove_plugin_class(plugin_cls)
     parsed_query = query_parser.parse(value)
     return searcher.search(parsed_query, terms=terms, limit=limit)
Exemplo n.º 2
0
def search_index(words):
    """Return (id, duanluo) pairs of documents matching any of *words*.

    Uses OR grouping with prefix matching enabled (wildcard syntax is
    replaced by the PrefixPlugin); at most 10 hits are taken per word.
    """
    matches = []
    with ix.searcher() as searcher:
        parser = QueryParser('duanluo', schema=ix.schema, group=qparser.OrGroup)
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.add_plugin(qparser.PrefixPlugin())

        for word in words:
            hits = searcher.search(parser.parse(u'{}'.format(word)), limit=10)
            matches.extend((hit['id'], hit['duanluo']) for hit in hits)
    return matches
Exemplo n.º 3
0
def basic_search(query,
                 query_parse,
                 group=default_group,
                 facet=default_facet,
                 index=default_index):
    """Search *index* for *query* in the *query_parse* field.

    :param query: raw query string
    :param query_parse: name of the schema field to search
    :param group: grouping class (AND vs OR semantics)
    :param facet: sort order for the results (sortedby facet)
    :param index: whoosh index to search
    :return: whoosh Results, unlimited (limit=None; whoosh's default is 10)
    """
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    # BUG FIX: configure plugins BEFORE parsing.  The original called
    # parser.parse() first, so removing PhrasePlugin and adding the
    # Sequence/FuzzyTerm plugins had no effect on the executed query.
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    parser.add_plugin(qparser.FuzzyTermPlugin())
    myquery = parser.parse(query)
    results = searcher.search(myquery, limit=None, sortedby=facet)
    print(results)
    return results
Exemplo n.º 4
0
def search_index(words):
    """Collect 'section' values of documents matching any of *words*.

    OR grouping means a document matches if it contains any query term;
    wildcard syntax is swapped for prefix matching.  Up to 10 hits per word.
    """
    sections = []
    with ix.searcher() as searcher:
        parser = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)

        # Enable prefix-style matching instead of full wildcard syntax.
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.add_plugin(qparser.PrefixPlugin())

        for word in words:
            hits = searcher.search(parser.parse(u'{}'.format(word)), limit=10)
            sections.extend(hit['section'] for hit in hits)
    return sections
Exemplo n.º 5
0
def search_index(words):
    """Return 'section' values of documents related to any of *words*.

    group=qparser.OrGroup: a document matches if it contains ANY of the
    query terms, not all of them.  The WildcardPlugin is replaced by the
    PrefixPlugin so queries like "prefix*" work as prefix matches.
    """
    related_sections = []
    with ix.searcher() as searcher:
        parser = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)

        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.add_plugin(qparser.PrefixPlugin())

        for word in words:
            parsed = parser.parse(u'%s' % word)
            # limit: cap on the number of hits returned per word
            hits = searcher.search(parsed, limit=10)
            for hit in hits:
                related_sections.append(hit['section'])
    return related_sections
Exemplo n.º 6
0
    def __init__(self, index_dir, var_path):
        """Open the whoosh index in *index_dir* and build three query parsers.

        :param index_dir: directory containing the whoosh index
        :param var_path: path handed to ``_make_var_reader`` (term variations)
        :raises IndexError: if the whoosh index cannot be opened
        """
        self._index = None
        try:
            self._index = wh_index.open_dir(index_dir)
        except wh_index.IndexError:
            # Re-raised as IndexError (presumably the builtin, so callers
            # need not import whoosh -- TODO confirm no local shadowing).
            raise IndexError

        self._var_reader = self._make_var_reader(var_path)

        # Custom operator syntax: AND via "AND"/"&", NOT via "NOT" or a
        # leading "-"; OR, AndMaybe and Require are disabled (None).
        op = OperatorsPlugin(
            And=r"\bAND\b|&", Or=None,  # r"\bOR\b|\|",
            Not=r"\bNOT\b|\s+-", AndMaybe=None, Require=None)
        # Main parser on 'content': range, boost and wildcard syntax removed.
        parser = QueryParser('content', _schema,
                             termclass=my_variations(self._var_reader))
        parser.remove_plugin_class(RangePlugin)
        parser.remove_plugin_class(BoostPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        parser.replace_plugin(op)
        self._parser = parser

        # Wildcard-enabled variant: identical except WildcardPlugin is kept.
        parser_wild = QueryParser('content', _schema,
                                  termclass=my_variations(self._var_reader))
        parser_wild.remove_plugin_class(RangePlugin)
        parser_wild.remove_plugin_class(BoostPlugin)
        parser_wild.replace_plugin(op)
        self._parser_wild = parser_wild

        # Filter parser on 'asfilter': only word-boundary AND/OR operators.
        op_filter = OperatorsPlugin(And=r"\bAND\b", Or=r"\bOR\b",
                                    Not=None, AndMaybe=None, Require=None)
        asf_parser = QueryParser('asfilter', _schema)
        asf_parser.replace_plugin(op_filter)
        self._asf_parser = asf_parser
Exemplo n.º 7
0
def search_index(words):
    """Return related (pid, part) pairs for each word in *words*.

    OR grouping lets any query term match.  The first hit for each word is
    skipped (it is presumably the source document itself), and the number
    of hits taken per word is randomized between 3 and 7.
    """
    related = []
    with ix.searcher() as searcher:
        # group=qparser.OrGroup: match any query term, not all of them.
        parser = QueryParser('part', schema=ix.schema, group=qparser.OrGroup)

        # Swap wildcard syntax for prefix matching (e.g. "foo*bar" style).
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.add_plugin(qparser.PrefixPlugin())

        # Random per-word result cap.
        result_cap = random.randint(3, 7)

        for word in words:
            hits = searcher.search(parser.parse(u'%s' % word), limit=result_cap)
            # rank 0 is skipped to avoid returning the document itself.
            for rank, hit in enumerate(hits):
                if rank > 0:
                    related.append((hit['pid'], hit['part']))
    return related
Exemplo n.º 8
0
    def __init__(self, index_dir, var_path):
        """Open the whoosh index in *index_dir* and build three query parsers.

        :param index_dir: directory containing the whoosh index
        :param var_path: path handed to ``_make_var_reader`` (term variations)
        :raises IndexError: if the whoosh index cannot be opened
        """
        self._index = None
        try:
            self._index = wh_index.open_dir(index_dir)
        except wh_index.IndexError:
            # Re-raised as IndexError (presumably the builtin, so callers
            # need not import whoosh -- TODO confirm no local shadowing).
            raise IndexError

        self._var_reader = self._make_var_reader(var_path)

        # Custom operator syntax: AND via "AND"/"&", NOT via "NOT" or a
        # leading "-"; OR, AndMaybe and Require are disabled (None).
        op = OperatorsPlugin(
            And=r"\bAND\b|&",
            Or=None,  # r"\bOR\b|\|",
            Not=r"\bNOT\b|\s+-",
            AndMaybe=None,
            Require=None,
        )
        # Main parser on 'content': range, boost and wildcard syntax removed.
        parser = QueryParser("content",
                             _schema,
                             termclass=my_variations(self._var_reader))
        parser.remove_plugin_class(RangePlugin)
        parser.remove_plugin_class(BoostPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        parser.replace_plugin(op)
        self._parser = parser

        # Wildcard-enabled variant: identical except WildcardPlugin is kept.
        parser_wild = QueryParser("content",
                                  _schema,
                                  termclass=my_variations(self._var_reader))
        parser_wild.remove_plugin_class(RangePlugin)
        parser_wild.remove_plugin_class(BoostPlugin)
        parser_wild.replace_plugin(op)
        self._parser_wild = parser_wild

        # Filter parser on 'asfilter': only word-boundary AND/OR operators.
        op_filter = OperatorsPlugin(And=r"\bAND\b",
                                    Or=r"\bOR\b",
                                    Not=None,
                                    AndMaybe=None,
                                    Require=None)
        asf_parser = QueryParser("asfilter", _schema)
        asf_parser.replace_plugin(op_filter)
        self._asf_parser = asf_parser
Exemplo n.º 9
0
# schema: a whoosh.fields.Schema object used when parsing.  The appropriate
# fields in the schema are used to tokenize terms/phrases before they are
# turned into query objects.  Pass None as the schema to create a parser
# that does not analyze the query text (mainly useful for testing).
parser = QueryParser("content", ix.schema)  # ix.schema is the same object as schema
print(len(parser.plugins), parser.plugins)  # 11
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>,
#  <whoosh.qparser.plugins.FieldsPlugin>,     <whoosh.qparser.plugins.WildcardPlugin>,   <whoosh.qparser.plugins.PhrasePlugin>,
#  <whoosh.qparser.plugins.RangePlugin>,      <whoosh.qparser.plugins.GroupPlugin>,      <whoosh.qparser.plugins.OperatorsPlugin>,
#  <whoosh.qparser.plugins.BoostPlugin>,      <whoosh.qparser.plugins.EveryPlugin>]
## default_set(): returns the default list of plugins to use.
print(len(parser.default_set()), parser.default_set())  # 10
parser.remove_plugin_class(whoosh.qparser.plugins.WildcardPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 10 10
# BUG FIX: add_plugin() expects a plugin *instance*; the original passed the
# class object itself, which makes parser.parse() fail when it invokes the
# plugin's methods (e.g. taggers()) on the class instead of an instance.
parser.add_plugin(qparser.PrefixPlugin())
print(len(parser.plugins), len(parser.default_set()))  # 11 10
## parse(text, normalize=True, debug=False) parses the input string and
## returns a whoosh.query.Query object/tree.
query = parser.parse('document')
## search(q, **kwargs) runs a whoosh.query.Query on this searcher and
## returns a Results object.  See the whoosh searching docs for details.
results = searcher.search(query)  # documents whose "content" contains "document"
print(
    results
)  # <Top 1 Results for Term('content', 'document') runtime=0.0015511049998622184>
print(type(results))  # <class 'whoosh.searching.Results'>

## Query method 2: the two lines above are one way; the single line below
## (searcher.find) works as well:
## find(defaultfield, querystring, **kwargs)
def predict_TF_IDF(data, docs_per_q):
    """Pick the best answer (A-D) per question by TF-IDF scoring.

    For each row, the question is parsed (punctuation stripped, fuzzy terms
    enabled) and the top *docs_per_q* documents are retrieved from ``ix``;
    each answer's tokens are then TF-IDF scored against those documents and
    the highest-scoring answer letter is predicted.

    NOTE(review): Python 2 code (`print` statement, `unicode`).

    :param data: DataFrame-like with columns "id", "question",
        "answerA".."answerD" -- assumed from the row accesses below
    :param docs_per_q: number of documents to retrieve per question
    :return: list of predicted answer letters ("A".."D"), one per row
    """
    # index docs
    exclude = set(string.punctuation)

    res = []

    for idx, row in data.iterrows():
        print row["id"]
        # get answers words
        w_A = set(utils.tokenize(row["answerA"]))
        w_B = set(utils.tokenize(row["answerB"]))
        w_C = set(utils.tokenize(row["answerC"]))
        w_D = set(utils.tokenize(row["answerD"]))

        # per-answer TF-IDF score accumulators
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0

        q_punc = row["question"]  # first thing to debug if not working
        # strip punctuation before parsing the question as a query
        question = "".join(ch for ch in q_punc if ch not in exclude)
        qp = QueryParser("content", schema=schema, group=qparser.OrGroup)
        qp.add_plugin(qparser.FuzzyTermPlugin())
        qp.remove_plugin_class(qparser.PhrasePlugin)
        qp.add_plugin(qparser.SequencePlugin())
        q = qp.parse(unicode(question, "utf-8"))
        # q = qp.parse('physics')
        # cp = qparser.CompoundsPlugin( AndMaybe="&~")
        with ix.searcher() as s, ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf:
            results = s.search(q, limit=docs_per_q)
            """
            u_id = unicode(uuid.uuid1())
            if not os.path.exists("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id):
                os.mkdir("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id)
            q_ix = index.create_in("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id, schema)
            q_writer = q_ix.writer()
            for document in results:
                q_writer.add_document(article_title=document['article_title'], content=document['content'])
            q_writer.commit()
            """
            # with q_ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf
            for document in results:
                # restrict scoring to this one retrieved document
                doc_parser = QueryParser("content", schema=schema)
                doc_q = doc_parser.parse(u"article_title:%s" % document["article_title"])
                # sum TF-IDF scores of each answer's words within the document;
                # words absent from the index contribute nothing
                for w in w_A:
                    try:
                        sc_A += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass
                for w in w_B:
                    try:
                        sc_B += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass
                for w in w_C:
                    try:
                        sc_C += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass
                for w in w_D:
                    try:
                        sc_D += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass

        # predict the answer letter with the highest accumulated score
        res.append(["A", "B", "C", "D"][np.argmax([sc_A, sc_B, sc_C, sc_D])])

    return res
Exemplo n.º 11
0
# Build a small demo index in "indexdir" and run a fuzzy-capable query on it.
# NOTE(review): Python 2 code (`print` statements).
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), content=TEXT)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(title=u"First document", content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", content=u"The second one is even more interesting!")
writer.add_document(title=u"Third document", content=u"letter first, stamp second, mail third")
writer.add_document(title=u"Fourth document", content=u"stamp first, mail third")
writer.add_document(title=u"Fivth document", content=u"letter first,  mail third")
writer.add_document(title=u"Sixth document", content=u"letters first, stamps second, mail third")
writer.add_document(title=u"Seventh document", content=u"stamp first, letters second, mial third")
writer.commit()


from whoosh.qparser import QueryParser, FuzzyTermPlugin, PhrasePlugin, SequencePlugin
with ix.searcher() as searcher:
    # Fuzzy terms ("term~") enabled; phrase syntax swapped for sequences.
    parser = QueryParser(u"content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    parser.remove_plugin_class(PhrasePlugin)
    parser.add_plugin(SequencePlugin())
    query = parser.parse(u"Apple iphone 6")
    print query
    results = searcher.search(query)
    print "nb of results =", len(results)
    for r in results:
        print r
Exemplo n.º 12
0
class QAModel:
    """Question answering over a whoosh index.

    Retrieves the single best-matching document for a query, then runs a
    reading-comprehension model (deeppavlov squad_bert) on it.
    """

    def __init__(self, index_path, index_name, model_config):
        """Load the model, open the index and prepare the query parser.

        :param index_path: directory of the whoosh index
        :param index_name: name of the index within that directory
        :param model_config: model config key, e.g. 'squad.squad_bert'
        """
        self._index_path = None
        self._index_name = None
        self._model_config = None
        self._model = None
        self._whoosh_index = None
        self._query_parser = None

        self.index_path = index_path
        self.index_name = index_name
        self.model_config = model_config

        self.load_model()
        self.load_index()
        self.create_and_prep_query_parser()

    @property
    def index_path(self):
        return self._index_path

    @index_path.setter
    def index_path(self, value):
        # BUG FIX: the original constructed ValueError but never raised it,
        # silently ignoring invalid assignments.  Same fix in all setters.
        if isinstance(value, str):
            self._index_path = value
        else:
            raise ValueError('Index path must be a string')

    @property
    def index_name(self):
        return self._index_name

    @index_name.setter
    def index_name(self, value):
        if isinstance(value, str):
            self._index_name = value
        else:
            raise ValueError('Index name must be a string')

    @property
    def model_config(self):
        return self._model_config

    @model_config.setter
    def model_config(self, value):
        if isinstance(value, str):
            self._model_config = value
        else:
            raise ValueError('Model config must be a string')

    @property
    def model(self):
        return self._model

    @model.setter
    def model(self, value):
        self._model = value

    @property
    def whoosh_index(self):
        return self._whoosh_index

    @whoosh_index.setter
    def whoosh_index(self, value):
        self._whoosh_index = value

    @property
    def query_parser(self):
        return self._query_parser

    @query_parser.setter
    def query_parser(self, value):
        self._query_parser = value

    def load_model(self):
        """Load the model specified by model_config (only squad_bert known)."""
        if self.model_config == 'squad.squad_bert':
            self.model = build_model(configs.squad.squad_bert, download=False)

    def load_index(self):
        """Open the named whoosh index from index_path."""
        self.whoosh_index = whoosh.index.open_dir(self.index_path,
                                                  indexname=self.index_name)

    def create_and_prep_query_parser(self):
        """Build an OR-grouped parser on 'text' for natural language queries."""
        self.query_parser = QueryParser('text',
                                        schema=self.whoosh_index.schema,
                                        group=OrGroup)
        # Wildcard syntax is meaningless in natural language input.
        self.query_parser.remove_plugin_class(WildcardPlugin)

    def answer_question(self, input_query):
        """Answer *input_query* using the top retrieved document.

        :param input_query: natural language question string
        :return: the model's answer for (top document, question)
        """
        with self.whoosh_index.searcher() as searcher:
            parsed_query = self.query_parser.parse(input_query)

            # Only the single best hit's text is fed to the model.
            search_results = searcher.search(parsed_query, limit=1)
            top_hit = [hit['text'] for hit in search_results][0]

            return self.model([top_hit], [input_query])
# Build a small demo index in "indexdir" and run a fuzzy-capable query on it.
# NOTE(review): Python 2 code (`print` statements).
schema = Schema(title=TEXT(stored=True), content=TEXT)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(title=u"First document",
                    content=u"This is the first document we've added!")
writer.add_document(title=u"Second document",
                    content=u"The second one is even more interesting!")
writer.add_document(title=u"Third document",
                    content=u"letter first, stamp second, mail third")
writer.add_document(title=u"Fourth document",
                    content=u"stamp first, mail third")
writer.add_document(title=u"Fivth document",
                    content=u"letter first,  mail third")
writer.add_document(title=u"Sixth document",
                    content=u"letters first, stamps second, mail third")
writer.add_document(title=u"Seventh document",
                    content=u"stamp first, letters second, mial third")
writer.commit()

from whoosh.qparser import QueryParser, FuzzyTermPlugin, PhrasePlugin, SequencePlugin
with ix.searcher() as searcher:
    # Fuzzy terms ("term~") enabled; phrase syntax swapped for sequences.
    parser = QueryParser(u"content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    parser.remove_plugin_class(PhrasePlugin)
    parser.add_plugin(SequencePlugin())
    query = parser.parse(u"Apple iphone 6")
    print query
    results = searcher.search(query)
    print "nb of results =", len(results)
    for r in results:
        print r
Exemplo n.º 14
0
tmp_dir = TemporaryDirectory()

# Tokenizer keeps '/' and '.' inside tokens, so a whole path like
# "this/is/a/test.html" indexes as a single term.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))),
)
ix = create_in(tmp_dir.name, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this/is/a/test.html")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this/is/a/hello.html   hello a yup")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this is a hello.html   hello a yup")
writer.commit()
from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    # Search for an exact path term; phrase syntax is disabled.
    query = QueryParser("content", ix.schema)
    query.remove_plugin_class(qparser.PhrasePlugin)
    # query.add_plugin(qparser.SequencePlugin("[\w/.]+"))
    query = query.parse('this/is/a/test.html')
    print(query)
    results = searcher.search(query)
    print(results)
    print(results[0])
    # print(results[1])

if __name__ == '__main__':
    print('yup')
    # sleep(20)
Exemplo n.º 15
0
# schema: a whoosh.fields.Schema object used when parsing.  The appropriate
# fields in the schema are used to tokenize terms/phrases before they are
# turned into query objects.  Pass None as the schema to create a parser
# that does not analyze the query text (mainly useful for testing).
parser = QueryParser("content", ix.schema)  # ix.schema is the same object as schema
print(len(parser.plugins), parser.plugins)  # 11
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>,
#  <whoosh.qparser.plugins.FieldsPlugin>,     <whoosh.qparser.plugins.WildcardPlugin>,   <whoosh.qparser.plugins.PhrasePlugin>,
#  <whoosh.qparser.plugins.RangePlugin>,      <whoosh.qparser.plugins.GroupPlugin>,      <whoosh.qparser.plugins.OperatorsPlugin>,
#  <whoosh.qparser.plugins.BoostPlugin>,      <whoosh.qparser.plugins.EveryPlugin>]
## default_set(): returns the default list of plugins to use.
print(len(parser.default_set()), parser.default_set())  # 10
parser.remove_plugin_class(whoosh.qparser.plugins.WildcardPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 10 10
# BUG FIX: add_plugin() expects a plugin *instance*; the original passed the
# class object itself, which makes parser.parse() fail when it invokes the
# plugin's methods (e.g. taggers()) on the class instead of an instance.
parser.add_plugin(qparser.PrefixPlugin())
print(len(parser.plugins), len(parser.default_set()))  # 11 10
## parse(text, normalize=True, debug=False) parses the input string and
## returns a whoosh.query.Query object/tree.
query = parser.parse('document')
## search(q, **kwargs) runs a whoosh.query.Query on this searcher and
## returns a Results object.  See the whoosh searching docs for details.
results = searcher.search(query)  # documents whose "content" contains "document"
print(results)  # <Top 1 Results for Term('content', 'document') runtime=0.0015511049998622184>
print(type(results))  # <class 'whoosh.searching.Results'>

## Query method 2: searcher.find is a one-line shortcut for parse + search.
## find(defaultfield, querystring, **kwargs)
results = searcher.find("title", "document")  # titles containing 'document'
print(results)  # <Top 2 Results for Term('title', 'document') runtime=0.0008875329999682435>
Exemplo n.º 16
0
# Two ways to open an existing whoosh index, then run a simple query.
# Method 1: FileStorage.open_index
storage = FileStorage('index')  # 'index' is the index directory path
idx1 = storage.open_index(indexname='idx1')

from whoosh import index
# Method 2: the open_dir convenience function
from whoosh.index import open_dir
idx2 = open_dir('index', indexname='idx2')  # indexname selects the named index
print(index.exists_in('index', indexname='idx2'))

from whoosh.qparser import QueryParser, MultifieldParser, OrGroup, FieldsPlugin

og = OrGroup.factory(0.9)

# FieldsPlugin removed: "field:term" syntax is treated as literal text.
qp = QueryParser("content", schema=idx1.schema)  # group=OrGroup
qp.remove_plugin_class(FieldsPlugin)
q = qp.parse("reset")
print(q)
# mqp = MultifieldParser(["title", "content"], schema=idx1.schema)
# mq = mqp.parse(u"many only")
#
# from whoosh.query import *
# myquery = And([Term("title", u"third"), q])
# # myquery = Term("title", u"ird")
# print(myquery)
searcher = idx1.searcher()
# limit=None: return every matching document (whoosh defaults to 10).
r = searcher.search(q=q, limit=None)
print(len(r))
for hit in r:
    print(hit)