Example #1
def search_all(username, field_name="content", key_word="分布式", **kwargs):
    WHOOSH_PATH = '/home/python/Learn/django/Haystack-Whoosh/whoosh_index/%s' % username
    if not os.path.exists(WHOOSH_PATH):
        return []

    # if not id and not keyword and not title and not content:
    #     return []
    index = whoosh_open_idx(WHOOSH_PATH, WHOOSH_SCHEMA)
    searcher = index.searcher()

    args = {
        "limit": None,
    }
    if "sortedby" in kwargs:
        sortedby = kwargs.pop("sortedby")
        if "orderby" in kwargs:
            orderby = kwargs.pop("orderby")
        else:
            orderby = "desc"
        if orderby == "desc":
            facet = sorting.FieldFacet(sortedby, reverse=True)
        else:
            facet = sorting.FieldFacet(sortedby)
        args["sortedby"] = facet
    args.update(kwargs)
    return searcher.find(field_name, key_word, **args)
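A minimal usage sketch for search_all (the username, the keyword arguments, and the per-user index directory under WHOOSH_PATH are hypothetical; the function returns [] when that directory does not exist):

# Hypothetical call: "sortedby"/"orderby" are popped into a FieldFacet,
# and any remaining keyword arguments (e.g. limit) go straight to searcher.find().
hits = search_all("demo_user", sortedby="id", orderby="desc", limit=20)
for hit in hits:
    print(dict(hit))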
Example #2
def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
Example #3
    def _paginated_search_mails(self, query, window, page):
        page = int(page) if page is not None and int(page) > 1 else 1
        window = int(window) if window is not None else 25

        with self._index.searcher() as searcher:
            tags_facet = sorting.FieldFacet('tag', allow_overlap=True, maptype=sorting.Count)
            sorting_facet = sorting.FieldFacet('date', reverse=True)
            results = searcher.search_page(query, page, pagelen=window, groupedby=tags_facet, sortedby=sorting_facet)
            return unique([mail['ident'] for mail in results]), sum(results.results.groups().values())
Example #4
def test_translate():
    domain = [
        ("alfa", 100, 50),
        ("bravo", 20, 80),
        ("charlie", 10, 10),
        ("delta", 82, 39),
        ("echo", 20, 73),
        ("foxtrot", 81, 59),
        ("golf", 39, 93),
        ("hotel", 57, 48),
        ("india", 84, 75),
    ]

    schema = fields.Schema(name=fields.TEXT(sortable=True),
                           a=fields.NUMERIC(sortable=True),
                           b=fields.NUMERIC(sortable=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for name, a, b in domain:
            w.add_document(name=u(name), a=a, b=b)

    with ix.searcher() as s:
        q = query.Every()

        # Baseline: just sort by a field
        r = s.search(q, sortedby="a")
        assert " ".join([
            hit["name"] for hit in r
        ]) == "charlie bravo echo golf hotel foxtrot delta india alfa"

        # Sort by reversed name
        target = [x[0] for x in sorted(domain, key=lambda x: x[0][::-1])]
        tf = sorting.TranslateFacet(lambda name: name[::-1],
                                    sorting.FieldFacet("name"))
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target

        # Sort by average of a and b
        def avg(a, b):
            return (a + b) / 2

        target = [
            x[0] for x in sorted(domain, key=lambda x: (x[1] + x[2]) / 2)
        ]
        af = sorting.FieldFacet("a")
        bf = sorting.FieldFacet("b")
        tf = sorting.TranslateFacet(avg, af, bf)
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target
Example #5
def search2(username,
            id=None,
            keyword=None,
            title=None,
            content=None,
            sortedby="id",
            orderby="desc",
            page=1,
            page_size=10):
    # A separate index directory can be used per user
    WHOOSH_PATH = '/home/python/Learn/django/Haystack-Whoosh/whoosh_index/%s' % username
    if not os.path.exists(WHOOSH_PATH):
        return []

    # if not id and not keyword and not title and not content:
    #     return []
    index = whoosh_open_idx(WHOOSH_PATH, WHOOSH_SCHEMA)
    searcher = index.searcher()

    print("-----------2")
    parser = QueryParser("content", index.schema)
    myquery = parser.parse("分布式")
    facet = sorting.FieldFacet("id", reverse=True)  # sort the search results
    results = searcher.search(
        myquery, limit=None, sortedby=facet)  # limit caps the number of hits (default is 10); see the official docs linked at the top of the blog post
    for result1 in results:
        print(dict(result1))
    print("-----------2")
Example #6
def test_nocachefield_segments():
    schema = fields.Schema(a=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("bravo"))
    w.add_document(a=u("echo"))
    w.add_document(a=u("juliet"))
    w.commit()
    w = ix.writer()
    w.add_document(a=u("kilo"))
    w.add_document(a=u("foxtrot"))
    w.add_document(a=u("charlie"))
    w.commit(merge=False)
    w = ix.writer()
    w.delete_by_term("a", u("echo"))
    w.add_document(a=u("alfa"))
    w.add_document(a=u("india"))
    w.add_document(a=u("delta"))
    w.commit(merge=False)

    with ix.searcher() as s:
        q = query.TermRange("a", u("bravo"), u("k"))
        facet = sorting.FieldFacet("a", reverse=True)

        r = s.search(q, sortedby=facet)
        assert [hit["a"] for hit in r] == [
            "juliet", "india", "foxtrot", "delta", "charlie", "bravo"
        ]

        mq = query.Or(
            [query.Term("a", u("bravo")),
             query.Term("a", u("delta"))])
        anq = query.AndNot(q, mq)
        r = s.search(anq, sortedby=facet)
        assert [hit["a"]
                for hit in r] == ["juliet", "india", "foxtrot", "charlie"]

        mq = query.Or(
            [query.Term("a", u("bravo")),
             query.Term("a", u("delta"))])
        r = s.search(q, mask=mq, sortedby=facet)
        assert [hit["a"]
                for hit in r] == ["juliet", "india", "foxtrot", "charlie"]

        fq = query.Or([
            query.Term("a", u("alfa")),
            query.Term("a", u("charlie")),
            query.Term("a", u("echo")),
            query.Term("a", u("india")),
        ])
        r = s.search(query.Every(), filter=fq, sortedby=facet)
        assert [hit["a"] for hit in r] == ["india", "charlie", "alfa"]

        nq = query.Not(
            query.Or([query.Term("a", u("alfa")),
                      query.Term("a", u("india"))]))
        r = s.search(query.Every(), filter=nq, sortedby=facet)
        assert [hit["a"] for hit in r] == [
            "kilo", "juliet", "foxtrot", "delta", "charlie", "bravo"
        ]
Example #7
    def test(ix):
        with ix.searcher() as s:
            # Sort by title
            r = s.search(query.Every(), sortedby="title")
            assert [hit["title"] for hit in r] == sorted_titles

            # Sort by reverse title
            facet = sorting.FieldFacet("title", reverse=True)
            r = s.search(query.Every(), sortedby=facet)
            assert [hit["title"] for hit in r] == list(reversed(sorted_titles))

            # Sort by num (-10 to 10) first, and within that, by reverse title
            facet = sorting.MultiFacet()
            facet.add_field("num")
            facet.add_field("title", reverse=True)

            r = s.search(query.Every(), sortedby=facet)
            target = ["Visual and Statistical Thinking",
                      "Cognitive Style of Powerpoint",
                      "Beautiful Evidence",
                      "Visual Explanations",
                      "Visual Display of Quantitative Information, The",
                      "Envisioning Information",
                      ]
            assert [hit["title"] for hit in r] == target
Example #8
def searchuser():
    q = request.args.get("q", "")  # repr
    offset = int(request.args.get("offset", 0))
    count = int(request.args.get("count", 20))
    with uix.searcher() as searcher:
        query = QueryParser("nickname", uix.schema).parse(
            "nickname:*%s*" %
            q)  # QueryParser("name", ix.schema).parse("tash*")
        #print query
        user_id = sorting.FieldFacet("user_id", reverse=True)
        results = searcher.search_page(query,
                                       max(offset / count, 0) + 1,
                                       pagelen=count,
                                       sortedby=user_id)
        print results.offset, count, offset, max(offset / count, 0) + 1
        if results.offset < offset:
            return "[]"
        tmp = hashlib.md5(str(mktime(datetime.datetime.now().timetuple()))
                          ).hexdigest() + "user_search_tmp"
        lua = """local searched = loadstring('return ' .. KEYS[1])()
for i = 1, table.getn(searched) do
    redis.call('sadd', KEYS[2], tostring(searched[i]))
end
local arr = redis.call('sort', KEYS[2], 'GET', 'users:*')
return arr"""
        res = rs.eval(
            lua, 2,
            '{' + ','.join(str(hit['user_id']) for hit in results) + '}', tmp)
        rs.delete(tmp)
        r = ",".join(res)
    return "[" + r + "]"
Example #9
def test_overlapping_lists():
    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.add_document(id=3, tags=u("delta echo alfa"))
        w.add_document(id=4, tags=u("echo alfa bravo"))

    with ix.searcher() as s:
        of = sorting.FieldFacet("tags", allow_overlap=True)
        cat = of.categorizer(s)
        assert not cat._use_vectors

        r = s.search(query.Every(), groupedby={"tags": of})
        assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4],
                                    'charlie': [0, 1, 2], 'delta': [1, 2, 3],
                                    'echo': [2, 3, 4]}

        fcts = sorting.Facets()
        fcts.add_field("tags", allow_overlap=True)
        r = s.search(query.Every(), groupedby=fcts)
        assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4],
                                    'charlie': [0, 1, 2], 'delta': [1, 2, 3],
                                    'echo': [2, 3, 4]}
Example #10
 def find_unique_orgid(self, q, limit):
     facet = sorting.FieldFacet("id", reverse=True)
     jobs = self.ix.searcher().search(q,
                                      collapse="orgid",
                                      sortedby=facet,
                                      limit=limit)
     return jobs
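A hedged usage sketch for find_unique_orgid (the finder instance, its schema fields, and the query text are hypothetical): collapse="orgid" keeps a single hit per orgid value, and with the reverse "id" facet that hit is the one with the highest id.

from whoosh.qparser import QueryParser

# Hypothetical usage: newest posting per organisation.
q = QueryParser("title", finder.ix.schema).parse("engineer")
jobs = finder.find_unique_orgid(q, limit=10)
for job in jobs:
    print(job["orgid"], job["id"])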
Example #11
def test_sorting():
    from whoosh import sorting

    schema = fields.Schema(id=fields.STORED,
                           name=fields.ID(stored=True),
                           size=fields.NUMERIC)
    ix = RamIndex(schema)

    with ix.writer() as w:
        w.add_document(id=0, name=u("bravo"), size=10)
        w.add_document(id=1, name=u("alfa"), size=9)
        w.add_document(id=2, name=u("delta"), size=8)
        w.add_document(id=3, name=u("charlie"), size=7)

    with ix.searcher() as s:
        q = query.Every()
        r = s.search(q, sortedby="name")
        assert_equal([hit["id"] for hit in r], [1, 0, 3, 2])

        r = s.search(q, sortedby="size")
        assert_equal([hit["id"] for hit in r], [3, 2, 1, 0])

        facet = sorting.FieldFacet("size", reverse=True)
        r = s.search(q, sortedby=facet)
        assert_equal([hit["id"] for hit in r], [0, 1, 2, 3])
Example #12
 def search(self, parameter):
     # Extract the query fields and build the parser
     keys = parameter['keys']
     parser = None
     if len(keys) == 1:
         parser = QueryParser(keys[0], schema=self.index.schema)
     elif len(keys) > 1:
         parser = MultifieldParser(keys, schema=self.index.schema)
     # Search parameters (sorting, pagination)
     # score = sorting.ScoreFacet()  # relevance score
     id = sorting.FieldFacet('id', reverse=False)  # sort on the 'id' field
     _limit = None  # pagination cap
     if 'page' in parameter and 'pagesize' in parameter:
         page = parameter['page']
         pagesize = parameter['pagesize']
         if page > 0 and pagesize != 0:
             _limit = page * pagesize
     # Run the search
     query = parser.parse(parameter['keywords'])
     result = self.searcher.search(query, limit=_limit, sortedby=[id])
     # Build the result list
     res = list()
     for hit in result:
         res.append({
             'title': hit['title'],
             'url': hit['url'],
             'content': re.sub(r'<[^>]+>', ' | ', hit.highlights('content'), flags=re.S)
         })
     return res
Example #13
def test_add_sortable():
    st = RamStorage()
    schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC)
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(chapter=u("alfa"), price=100)
        w.add_document(chapter=u("bravo"), price=200)
        w.add_document(chapter=u("charlie"), price=300)
        w.add_document(chapter=u("delta"), price=400)
    with ix.writer() as w:
        w.add_document(chapter=u("bravo"), price=500)
        w.add_document(chapter=u("alfa"), price=600)
        w.add_document(chapter=u("delta"), price=100)
        w.add_document(chapter=u("charlie"), price=200)
        w.merge = False

    with ix.reader() as r:
        assert not r.has_column("chapter")
        assert not r.has_column("price")

    with ix.writer() as w:
        sorting.add_sortable(w, "chapter", sorting.StoredFieldFacet("chapter"))
        sorting.add_sortable(w, "price", sorting.FieldFacet("price"))
        w.schema.test = 100

    with ix.reader() as r:
        assert r.has_column("chapter")
        assert r.has_column("price")

        chapr = r.column_reader("chapter")
        pricer = r.column_reader("price")
        assert chapr[0] == "alfa"
        assert pricer[0] == 100
Example #14
def bounced_addresses_filter(searcher, contacts):
    query = QueryParser('bounced', searcher.schema).parse('*')
    bounced_addresses = searcher.search(
        query,
        limit=None,
        groupedby=sorting.FieldFacet('bounced', allow_overlap=True)).groups()
    return set(contacts) - set(flatten([bounced_addresses]))
Example #15
    def search(self, query_string, page="1", limit=20):
        results = []
        query_string = unicode(query_string, 'utf-8')
        with self.index.searcher() as searcher:
            query = QueryParser("content",
                                self.index.schema).parse(query_string)

            scores = sorting.ScoreFacet()
            sortperson = sorting.FieldFacet("person")
            sortcollection = sorting.FieldFacet("collection", reverse=True)

            resultset = searcher.search_page(
                query,
                int(page),
                pagelen=int(limit),
                sortedby=[sortcollection, scores, sortperson])
            # NOTE: Need to copy plain dicts out, since once the searcher
            #   dies (end of with block), the Hit results lose their reference to
            #   the data.
            for hit in resultset[0:]:
                # Grab a copy of the results as a plain dict.
                result = hit.fields()

                # Also grab the surrounding fragment as a highlight.
                # NOTE: This is pretty much the only point we know
                #   "where" in the matched document the hit occurs.
                #   The raw content we indexed is stored in 'content',
                #   so we tell the Hit instance to pull the surrounding
                #   text fragments from there.
                # Also:
                #   These highlights are pretty much the only reason
                #   we need to bother stashing the entire document.
                #   Otherwise, the index can be even smaller.
                #   Whoosh can also pull the content from the
                #   original files, if they're available.  But since
                #   our text content isn't large, keeping it in the
                #   index seems faster.
                result['highlights'] = hit.highlights('content')
                results.append(result)

            results = {
                'matches': results,
                'matches_returned': resultset.scored_length(),
                'total_matches': len(resultset),
                'query': query_string
            }
        return results
Example #16
 def _search_all_mails(self, query):
     with self._index.searcher() as searcher:
         sorting_facet = sorting.FieldFacet('date', reverse=True)
         results = searcher.search(query,
                                   sortedby=sorting_facet,
                                   reverse=True,
                                   limit=None)
         return unique([mail['ident'] for mail in results])
Example #17
def test_reverse_collapse():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT,
                           path=fields.ID(stored=True),
                           tags=fields.KEYWORD,
                           order=fields.NUMERIC(stored=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(title=u"First document",
                       content=u"This is my document!",
                       path=u"/a",
                       tags=u"first",
                       order=20.0)
        w.add_document(title=u"Second document",
                       content=u"This is the second example.",
                       path=u"/b",
                       tags=u"second",
                       order=12.0)
        w.add_document(title=u"Third document",
                       content=u"Examples are many.",
                       path=u"/c",
                       tags=u"third",
                       order=15.0)
        w.add_document(title=u"Thirdish document",
                       content=u"Examples are too many.",
                       path=u"/d",
                       tags=u"third",
                       order=25.0)

    with ix.searcher() as s:
        q = query.Every('content')
        r = s.search(q)
        assert [hit["path"] for hit in r] == ["/a", "/b", "/c", "/d"]

        q = query.Or([
            query.Term("title", "document"),
            query.Term("content", "document"),
            query.Term("tags", "document")
        ])
        cf = sorting.FieldFacet("tags")
        of = sorting.FieldFacet("order", reverse=True)
        r = s.search(q, collapse=cf, collapse_order=of, terms=True)
        assert [hit["path"] for hit in r] == ["/a", "/b", "/d"]
Example #18
def autocomplete(query_str, results=10):
    query_str = u' '.join([
        t.text for t in _analyzer(query_str)
        if not 'university'.startswith(t.text)
    ])

    q = _query_parser.parse(query_str)
    return [
        _ror_rows[row['ror']] for row in _searcher.search_page(
            q,
            1,
            results,
            sortedby=[
                sorting.FieldFacet('citation_score', reverse=True),
                sorting.FieldFacet('num_students', reverse=True),
                sorting.ScoreFacet(),
            ])
    ]
Example #19
def search(term):
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(term)
        date_sort_facet = sorting.FieldFacet("date", reverse=True)
        results = searcher.search(query, sortedby=date_sort_facet)
        links = []
        for x in results:
            links.append('<li><a href="' + x['path'] + '.html' + '">' +
                         x['title'] + '</a></li>\n')
        return u"".join(links).encode('utf8')
Example #20
def search_index(query):
    # sizes = sorting.FieldFacet("size")
    # prices = sorting.FieldFacet("price", reverse=True)
    # results = searcher.search(myquery, sortedby=[sizes, prices])
    lec_ids = sorting.FieldFacet("lec_id")
    agreeCounts = sorting.FieldFacet("agreeCount", reverse=True)
    marks = sorting.FieldFacet("mark", reverse=True)
    ix = open_dir("indexdir")
    with ix.searcher() as searcher:
        # query = MultifieldParser(["url", "title", "tags", "note", "article"], ix.schema).parse("使用")
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        # This failed at first because the txt files were not UTF-8 encoded, which produced garbled text.
        # results = searcher.search(myquery,limit=None)
        # results = searcher.search_page(myquery, 5)
        results = searcher.search(myquery,
                                  limit=None,
                                  sortedby=[lec_ids, agreeCounts, marks])
        print(len(results))
        print(type(results))
        # print(results[:])
        # for i in range(len(results)):
        #     print(results[i]);print('\n') #IndexError: results[10]: Results only has 10 hits
        print(results)
        count = 0
        client = pymongo.MongoClient("mongodb://address")
        db_opt = client["course_info"]
        b = list()
        for i in results:
            count += 1
            print(i)
            j = dict(i)
            print(j)
            db_opt.result.insert_one({
                'lec_id': j['lec_id'],
                'agreeCount': j['agreeCount'],
                'mark': j['mark'],
                'content': j['content']
            })
            b.append(j['lec_id'])
        seta = set(b)
        print(count)
        return seta
Example #21
    def _search_tag_groups(self, is_filtering_tags):
        seen = None
        query_parser = QueryParser('tag', self._index.schema)
        options = {'limit': None, 'groupedby': sorting.FieldFacet('tag', allow_overlap=True), 'maptype': sorting.Count}

        with self._index.searcher() as searcher:
            total = searcher.search(query_parser.parse('*'), **options).groups()
            if not is_filtering_tags:
                seen = searcher.search(query_parser.parse("* AND flags:%s" % Status.SEEN), **options).groups()
        return seen, total
Example #22
    def contacts(self, query):
        restrict_q = Term("tag", "drafts") | Term("tag", "trash")

        if query:
            to = QueryParser('to', self._index.schema)
            cc = QueryParser('cc', self._index.schema)
            bcc = QueryParser('bcc', self._index.schema)
            sender = QueryParser('sender', self._index.schema)
            with self._index.searcher() as searcher:
                to = searcher.search(to.parse("*%s*" % query), limit=None, mask=restrict_q,
                                     groupedby=sorting.FieldFacet('to', allow_overlap=True)).groups()
                cc = searcher.search(cc.parse("*%s*" % query), limit=None, mask=restrict_q,
                                     groupedby=sorting.FieldFacet('cc', allow_overlap=True)).groups()
                bcc = searcher.search(bcc.parse("*%s*" % query), limit=None, mask=restrict_q,
                                      groupedby=sorting.FieldFacet('bcc', allow_overlap=True)).groups()
                sender = searcher.search(sender.parse("*%s*" % query), limit=None, mask=restrict_q,
                                         groupedby=sorting.FieldFacet('sender', allow_overlap=True)).groups()
                return flatten([to, cc, bcc, sender])

        return []
Example #23
def test_compound_sort():
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa"
              ).split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet"
              ).split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        sortedby = [
            sorting.FieldFacet("a"),
            sorting.FieldFacet("b", reverse=True),
            sorting.FieldFacet("c")
        ]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        assert output == [
            "alfa charlie charlie",
            "alfa charlie india",
            "alfa bravo echo",
            "alfa alfa alfa",
            "alfa alfa golf",
            "bravo charlie foxtrot",
            "bravo bravo bravo",
            "bravo bravo hotel",
            "bravo alfa delta",
            "bravo alfa juliet",
        ]
Example #24
def search_addresses(searcher, query):
    restrict_q = Term("tag", "drafts") | Term("tag", "trash")
    results = []
    for field in ['to', 'cc', 'bcc', 'sender']:
        query_parser = QueryParser(field, searcher.schema)
        results.append(
            searcher.search(query_parser.parse("*%s*" % query),
                            limit=None,
                            mask=restrict_q,
                            groupedby=sorting.FieldFacet(
                                field, allow_overlap=True)).groups())
    return flatten(results)
Example #25
def search(q, filters, query_string, max_facets=5):
    """ Search for a query term and a set o filters
        Returns a list of hits and the representation of the facets
    """
    ix = get_or_create_index()
    hits = []
    facets = [
        sorting.FieldFacet("tags", allow_overlap=True, maptype=sorting.Count)
    ]
    tags = Tag.objects.values(
        'slug',
        'title',
    )
    parser = qparser.QueryParser("text", schema=ix.schema)  # , group=og)
    try:
        q = parser.parse("*" + q + "*")
    except Exception:
        q = None
    if q or filters:
        searcher = ix.searcher()
        for filter_value in filters:
            filter_name = "tags"
            q = q & query.Term(filter_name, filter_value)
        hits = searcher.search(q.normalize(), groupedby=facets)
        active_facets = []
        sorted_facets = sorted(hits.groups("tags").items(),
                               key=operator.itemgetter(1, 0),
                               reverse=True)
        facets = []
        for facet_slug, facet_value in sorted_facets:
            if not facet_slug:
                continue
            qs = query_string.copy()
            qs["page"] = "1"
            if facet_slug in filters:
                qs.setlist('f', [f for f in filters if f != facet_slug])
                state = "active"
            else:
                qs.appendlist('f', facet_slug)
                state = "available"
            obj = tags.get(slug=facet_slug)
            facet_dict = {
                'slug': facet_slug,
                'title': obj.get("title", ""),
                'count': facet_value,
                'qs': qs.urlencode(),
            }

            if state == 'active':
                active_facets.append(facet_dict)
            else:
                facets.append(facet_dict)
        return {"hits": hits, "facets": facets, "active_facets": active_facets}
Example #26
    def contacts(self, query):
        if query:
            to = QueryParser('to', self._index.schema)
            cc = QueryParser('cc', self._index.schema)
            bcc = QueryParser('bcc', self._index.schema)
            with self._index.searcher() as searcher:
                to = searcher.search(to.parse("*%s*" % query),
                                     limit=None,
                                     groupedby=sorting.FieldFacet(
                                         'to', allow_overlap=True)).groups()
                cc = searcher.search(cc.parse("*%s*" % query),
                                     limit=None,
                                     groupedby=sorting.FieldFacet(
                                         'cc', allow_overlap=True)).groups()
                bcc = searcher.search(bcc.parse("*%s*" % query),
                                      limit=None,
                                      groupedby=sorting.FieldFacet(
                                          'bcc', allow_overlap=True)).groups()
                return flatten([to, cc, bcc])

        return []
Example #27
 def search(self, text: str, sorted_by_count=False, fieldname='body'):
     """returns (dset, item_gen)
     results.docs(): the set of docnums of the hits in the index
     results.items(): a generator of (docnum, score) pairs, from highest score/count to lowest
     """
     isearcher = self._searcher
     skw = {'limit': None}
     skw['q'] = QueryParser("body", self.ix.schema).parse(text)
     if sorted_by_count:
         skw['sortedby'] = sorting.FieldFacet('count', reverse=True)
     results = isearcher.search(**skw)
     return results.docs(), results.items()
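A short hedged sketch of consuming the return value above (the store instance and the query string are hypothetical):

# Hypothetical call: docnums is the set of matching document numbers;
# items is the (docnum, score) generator described in the docstring above.
docnums, items = store.search("database sharding", sorted_by_count=True)
print(len(docnums))
for docnum, score in list(items)[:5]:
    print(docnum, score)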
Example #28
def search(humanReadableId):
    query = request.args.get('q', '').strip()
    pagination = None
    suggestion = None
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after
        # regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].find("Image:") == 0:
                return 1
            else:
                return 0

        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
                sorting.FieldFacet("reverse_links", reverse=True),
            ])
        else:
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
            ])

        (pagination, suggestion) = paginated_search(ix, ["title", "content"],
                                                    query,
                                                    page,
                                                    weighting=weighting,
                                                    sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))
Example #29
def test_group_types():
    schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(a=0, b=u("blah"), c=u("apple"))
        w.add_document(a=1, b=u("blah blah"), c=u("bear"))
        w.add_document(a=2, b=u("blah blah blah"), c=u("apple"))
        w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear"))
        w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple"))
        w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear"))
        w.add_document(a=6,
                       b=u("blah blah blah blah blah blah blah"),
                       c=u("apple"))

    with ix.searcher() as s:
        q = query.Term("b", "blah")

        f = sorting.FieldFacet("c", maptype=sorting.UnorderedList)
        r = s.search(q, groupedby=f)
        gs = r.groups("c")
        assert_equal(gs["apple"], [0, 2, 4, 6])
        assert_equal(gs["bear"], [1, 3, 5])

        f = sorting.FieldFacet("c", maptype=sorting.Count)
        r = s.search(q, groupedby=f)
        gs = r.groups("c")
        assert_equal(gs["apple"], 4)
        assert_equal(gs["bear"], 3)

        f = sorting.FieldFacet("c", maptype=sorting.Best)
        r = s.search(q, groupedby=f)
        gs = r.groups()
        assert_equal(gs["apple"], 6)
        assert_equal(gs["bear"], 5)

        r = s.search(q, groupedby="c", maptype=sorting.Count)
        gs = r.groups()
        assert_equal(gs["apple"], 4)
        assert_equal(gs["bear"], 3)
Example #30
    def get_context(self):
        if self.context:
            return self.context

        self.get_index()

        from whoosh.qparser.dateparse import DateParserPlugin

        datetimes = sorting.FieldFacet("datetime", reverse=True)
        parser = qparser.QueryParser("text", self.index.schema)
        query = parser.parse("*")
        searcher = self.index.searcher()

        restrict_replies = whoosh.query.Term("reply", True)

        results = searcher.search(
            query, mask=restrict_replies, sortedby=datetimes, limit=MAX_TWEETS_NUMBER)

        click.secho("Adding to context...", fg="green")

        context = set()

        for result in results:
            text = result["text"]

            click.secho(text)

            import re

            # don't want these influence context
            # drop usernames
            text = re.sub(r"@[^ ]+", " ", text)
            # drop links
            text = re.sub(r"http(s?)://[^ ]+", " ", text)
            # drop cut-off text from the end of manual rts
            text = re.sub(r"[^ ]+…$", " ", text)

            context.update(text.split(" "))

        # join the collected words, discarding empty strings
        context = " ".join(filter(None, context))

        click.secho("Processed context:", fg="green")
        click.secho(context)

        self.context = context

        return self.context