Exemplo n.º 1
0
def test_gtlt():
    """Check that the GtLt plugin turns >/>=/</<= prefixes into range queries."""
    schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC,
                           c=fields.KEYWORD,
                           d=fields.NUMERIC(float), e=fields.DATETIME)
    parser = qparser.QueryParser("a", schema)
    parser.add_plugin(plugins.GtLtPlugin())
    parser.add_plugin(dateparse.DateParserPlugin())

    parsed = parser.parse(u("a:hello b:>100 c:<=z there"))
    assert_equal(parsed.__class__, query.And)
    assert_equal(len(parsed), 4)
    assert_equal(parsed[0], query.Term("a", "hello"))
    assert_equal(parsed[1], query.NumericRange("b", 100, None, startexcl=True))
    assert_equal(parsed[2], query.TermRange("c", None, 'z'))
    assert_equal(parsed[3], query.Term("a", "there"))

    parsed = parser.parse(u("hello e:>'29 mar 2001' there"))
    assert_equal(parsed.__class__, query.And)
    assert_equal(len(parsed), 3)
    assert_equal(parsed[0], query.Term("a", "hello"))
    # As of this writing, date ranges don't support startexcl/endexcl
    assert_equal(parsed[1],
                 query.DateRange("e", datetime(2001, 3, 29, 0, 0), None))
    assert_equal(parsed[2], query.Term("a", "there"))

    parsed = parser.parse(u("a:> alfa c:<= bravo"))
    assert_equal(text_type(parsed), "(a:a: AND a:alfa AND a:c: AND a:bravo)")

    # Without the fields/range plugins the operators parse as plain terms.
    parser.remove_plugin_class(plugins.FieldsPlugin)
    parser.remove_plugin_class(plugins.RangePlugin)
    parsed = parser.parse(u("hello a:>500 there"))
    assert_equal(text_type(parsed), "(a:hello AND a:a: AND a:500 AND a:there)")
Exemplo n.º 2
0
    def parse_range(self,
                    fieldname,
                    start,
                    end,
                    startexcl,
                    endexcl,
                    boost=1.0):
        """Build a NumericRange query for this field.

        Each non-None endpoint is round-tripped through
        ``to_text``/``from_text`` so it is normalized exactly the way
        indexed values are; any conversion failure is reported to the
        parser as a QueryParserError.

        :param fieldname: name of the field being queried.
        :param start: lower bound, or None for an open lower end.
        :param end: upper bound, or None for an open upper end.
        :param startexcl: whether the lower bound is exclusive.
        :param endexcl: whether the upper bound is exclusive.
        :param boost: score boost for the resulting query.
        """
        from whoosh import query
        from whoosh.qparser.common import QueryParserError

        try:
            if start is not None:
                start = self.from_text(self.to_text(start))
            if end is not None:
                end = self.from_text(self.to_text(end))
        except Exception as e:
            # Bind the exception directly instead of the Python 2
            # compatibility relic ``sys.exc_info()[1]``.
            raise QueryParserError(e)

        return query.NumericRange(fieldname,
                                  start,
                                  end,
                                  startexcl,
                                  endexcl,
                                  boost=boost)
Exemplo n.º 3
0
def test_searching():
    """Run a battery of query types against the shared test index."""
    with make_index().searcher() as s:

        def check(q, expected, **kwargs):
            # Compare the ids of the hits, in rank order.
            hits = s.search(q, **kwargs)
            assert_equal([hit["id"] for hit in hits], expected)

        check(query.Term("text", u("format")), ["format", "vector"])
        check(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        check(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        check(query.Wildcard("id", u("*st*")), ["stored", "const"])
        check(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        check(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        check(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"], limit=None)

        # Every() and Every("subs") should both match all documents.
        everything = ["fieldtype", "format", "vector", "scorable", "stored",
                      "unique", "const"]
        check(query.Every(), everything)
        check(query.Every("subs"), everything)
Exemplo n.º 4
0
def do_search(txt, sumlevel=None, kind=None, tries=0, limit=10, is_stem=None):
    """Search attributes matching ``txt``.

    Optionally restricts results by ``kind``, ``sumlevel`` and
    ``is_stem``; when nothing matches and the spell corrector has a
    suggestion, retries with it (giving up after 3 attempts).

    Returns a tuple ``(data, suggestions, tries)``.
    """
    txt = txt.replace(",", "")

    # Combine the optional kind/sumlevel restrictions into one filter.
    my_filter = None
    if kind and sumlevel:
        my_filter = query.And([query.Term("kind", kind),
                               query.Term("sumlevel", sumlevel)])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)

    # Optionally AND-in the STEM restriction.
    if is_stem and is_stem > 0:
        stem_clause = query.NumericRange("is_stem", 1, is_stem)
        my_filter = stem_clause if my_filter is None else my_filter & stem_clause

    if tries > 2:
        return [], [], []

    q = qp.parse(txt)
    weighter = SimpleWeighter(txt, B=.45, content_B=1.0, K1=1.5)
    with ix.searcher(weighting=weighter) as s:
        # Only ask the corrector for suggestions on non-trivial input.
        suggs = []
        if len(txt) > 2:
            suggs = s.corrector("display").suggest(txt, limit=10, maxdist=2,
                                                   prefix=3)
        results = s.search_page(q, 1, sortedby=[scores], pagelen=20,
                                filter=my_filter)
        data = [[r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
                 r["sumlevel"] if "sumlevel" in r else "",
                 r["is_stem"] if "is_stem" in r else False,
                 r["url_name"] if "url_name" in r else None]
                for r in results]
        if not data and suggs:
            # Nothing matched; retry with the best spelling suggestion.
            return do_search(suggs[0], sumlevel, kind, tries=tries + 1,
                             limit=limit, is_stem=is_stem)
        return data, suggs, tries
Exemplo n.º 5
0
Arquivo: fields.py Projeto: oier/Yaki
    def parse_query(self, fieldname, qstring, boost=1.0):
        """Parse a date query string into a Term or NumericRange query.

        An unambiguous date becomes an exact Term; an ambiguous one
        (e.g. only a month) becomes a numeric range spanning its floor
        and ceiling.
        """
        from whoosh import query
        from whoosh.support.times import is_ambiguous

        when = self._parse_datestring(qstring)
        if not is_ambiguous(when):
            return query.Term(fieldname, self.to_text(when), boost=boost)
        # An ambiguous date covers a span, so query the whole span.
        low = datetime_to_long(when.floor())
        high = datetime_to_long(when.ceil())
        return query.NumericRange(fieldname, low, high)
Exemplo n.º 6
0
def test_numeric_ranges_unsigned():
    """An unsigned NUMERIC field must honor a lower-bound-exclusive range."""
    schema = fields.Schema(num2=fields.NUMERIC(stored=True, signed=False))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for value in [1, 10, 100, 1000, 2, 20, 200, 2000, 9, 90, 900, 9000]:
            w.add_document(num2=value)

    with ix.searcher() as s:
        # Lower bound 55, exclusive; no upper bound.
        hits = s.search(query.NumericRange("num2", 55, None, True, False),
                        limit=None)
        for hit in hits:
            assert int(hit["num2"]) >= 55
Exemplo n.º 7
0
    def parse_range(self, fieldname, start, end, startexcl, endexcl,
                    boost=1.0):
        """Turn a date range into a NumericRange over packed longs.

        A fully open range (both ends None) matches every document with
        a value in the field instead.
        """
        from whoosh import query

        if start is None and end is None:
            return query.Every(fieldname, boost=boost)

        # Widen each endpoint to the bounds of its (possibly ambiguous)
        # date before packing it into a sortable long.
        if start is not None:
            start = datetime_to_long(self._parse_datestring(start).floor())
        if end is not None:
            end = datetime_to_long(self._parse_datestring(end).ceil())

        return query.NumericRange(fieldname, start, end, boost=boost)
Exemplo n.º 8
0
    def parse_query(self, fieldname, qstring, boost=1.0):
        """Parse a date query string into a Term or NumericRange query.

        Returns an error query when the string cannot be parsed as a
        date.  An ambiguous date (e.g. only a month) becomes a numeric
        range covering its whole span; an unambiguous one becomes an
        exact Term.
        """
        from whoosh import query
        from whoosh.util.times import is_ambiguous

        # Bug fix: the bare "except:" also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception and bind it directly
        # instead of going through sys.exc_info().
        try:
            at = self._parse_datestring(qstring)
        except Exception as e:
            return query.error_query(e)

        if is_ambiguous(at):
            startnum = datetime_to_long(at.floor())
            endnum = datetime_to_long(at.ceil())
            return query.NumericRange(fieldname, startnum, endnum)
        else:
            return query.Term(fieldname, at, boost=boost)
Exemplo n.º 9
0
    def parse_range(self, fieldname, start, end, startexcl, endexcl,
                    boost=1.0):
        """Validate and normalize both endpoints, then build a
        NumericRange query; raises QueryParserError on a non-numeric
        endpoint."""
        from whoosh import query
        from whoosh.qparser.common import QueryParserError

        def _checked(value, label):
            # Validate and normalize one endpoint; None passes through.
            if value is None:
                return None
            if not self.is_valid(value):
                raise QueryParserError("Range %s %r is not a valid number"
                                       % (label, value))
            return self.prepare_number(value)

        start = _checked(start, "start")
        end = _checked(end, "end")
        return query.NumericRange(fieldname, start, end, startexcl, endexcl,
                                  boost=boost)
def test_numeric_filter():
    """A numeric-range filter that matches nothing must block all hits."""
    schema = fields.Schema(status=fields.NUMERIC, tags=fields.TEXT)
    ix = RamStorage().create_index(schema)

    # Add a single document with status = -2
    with ix.writer() as w:
        w.add_document(status=-2, tags=u"alfa bravo")

    with ix.searcher() as s:
        # The filter range [0, 2] excludes the only document.
        status_filter = query.NumericRange("status", 0, 2)
        assert s.search(status_filter).scored_length() == 0

        # The term query alone does match the document...
        term_q = query.Term("tags", u"alfa")
        assert s.search(term_q).scored_length() == 1

        # ...but not once the non-matching filter is applied.
        assert s.search(term_q, filter=status_filter).scored_length() == 0
Exemplo n.º 11
0
Arquivo: fields.py Projeto: oier/Yaki
                    endexcl,
                    boost=1.0):
        from whoosh import query
        from whoosh.qparser import QueryParserError

        try:
            if start is not None:
                start = self.from_text(self.to_text(start))
            if end is not None:
                end = self.from_text(self.to_text(end))
        except Exception, e:
            raise QueryParserError(e)

        return query.NumericRange(fieldname,
                                  start,
                                  end,
                                  startexcl,
                                  endexcl,
                                  boost=boost)

    def sortable_values(self, ixreader, fieldname):
        """Yield (text, value) pairs for the field's full-precision terms."""
        decode = self._from_text

        for text in ixreader.lexicon(fieldname):
            # Full-precision values sort first and start with "\x00";
            # stop at the first term that doesn't.
            if text[0] != "\x00":
                break
            yield (text, decode(text))


class DATETIME(NUMERIC):
Exemplo n.º 12
0
def do_search(txt,
              sumlevel=None,
              kind=None,
              tries=0,
              limit=10,
              is_stem=None,
              my_vars=None):
    """Search attributes for ``txt`` with variable detection and a
    spelling-suggestion retry.

    Builds an optional kind/sumlevel/is_stem filter, searches a
    separate variables index for matched-variable metadata and
    highlighted keywords (skipped when ``my_vars`` was already supplied
    by a previous attempt), de-boosts query terms overlapping those
    keywords, then runs the main attribute search.  When nothing
    matches and the corrector has a suggestion, recurses with ``tries``
    incremented (giving up after 3 attempts).

    Returns a tuple ``(data, suggs, tries, my_vars)``.

    NOTE(review): relies on module-level globals (``qp``, ``vars_qp``,
    ``ix``, ``vars_ix``, ``scores``, ``stem``, ``SimpleWeighter``,
    ``nationwide_results``) and the Python 2 ``unicode`` builtin.
    ``limit`` is only forwarded on retries, never applied to this
    search -- confirm whether that is intentional.
    """
    txt = txt.replace(",", "")

    my_filter = None

    # Combine the optional kind/sumlevel restrictions into one filter.
    if kind and sumlevel:
        kf = query.Term("kind", kind)
        sf = query.Term("sumlevel", sumlevel)
        my_filter = query.And([kf, sf])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)
    # Optionally AND-in the STEM restriction (is_stem in [1, is_stem]).
    if is_stem and is_stem > 0 and my_filter is not None:
        my_filter = my_filter & query.NumericRange("is_stem", 1, is_stem)
    elif is_stem and is_stem > 0 and my_filter is None:
        my_filter = query.NumericRange("is_stem", 1, is_stem)

    # Give up after three attempts (initial call + 2 retries).
    if tries > 2:
        return [], [], [], []
    q = qp.parse(txt)

    # Stem longer tokens so the variables index matches word variants;
    # tokens of <= 3 chars are left as-is.
    rext = RegexTokenizer()
    var_txt = u" ".join([
        stem(token.text) if len(token.text) > 3 else token.text
        for token in rext(unicode(txt))
    ])

    var_q = vars_qp.parse(var_txt)
    var_keywords = {}
    vars_max_score = None
    # search for variables in query
    if not my_vars:
        # my_vars can save original vars detected before autocorrecting for spelling,
        # so we'll only do var searches that haven't yet been autocorrected
        with vars_ix.searcher() as s:
            # s = vars_ix.searcher()
            results = s.search(var_q)
            # raise Exception(list(results)[0])
            vscores = [r.score for r in results]
            vars_max_score = max(vscores) if vscores else None

            # Collect variable metadata; comma-joined fields are split
            # back into lists here.
            my_vars = [{
                "matched_on":
                r.highlights("name"),
                "name":
                r["name"],
                "description":
                r["description"].split(","),
                "section":
                r["section"],
                "section_title":
                r["section_title"],
                "related_attrs":
                r["related_attrs"].split(","),
                "related_vars":
                r["related_vars"].split(","),
                "params":
                json.loads(r["params"]) if 'params' in r else None
            } for r in results]
        if my_vars:
            # Drop variables whose related_vars were already seen, and
            # harvest the <b>-highlighted keywords that matched.
            already_seen = []
            filtered_my_vars = []
            for my_var in my_vars:
                if my_var["related_vars"] not in already_seen:
                    filtered_my_vars.append(my_var)
                already_seen.append(my_var["related_vars"])
                highlight_txt = my_var["matched_on"]

                if highlight_txt:
                    matches = re.findall(r'<b class="[^>]+">([^>]+)</b>',
                                         highlight_txt)
                    if matches:
                        for matched_txt in matches:
                            var_keywords[matched_txt] = True
            my_vars = filtered_my_vars

    # De-boost query terms that duplicate matched variable keywords so
    # attribute results aren't dominated by variable vocabulary.
    # Iterating q raises NotImplementedError for single-term queries,
    # which the except branch handles by treating q itself as the term.
    try:
        for term in q:
            for keyword in var_keywords.keys():
                if term.text == 'in' and " in " in txt:
                    term.boost = -1
                elif term.text in keyword or keyword in term.text:
                    term.boost = -0.5
    except NotImplementedError:
        for keyword in var_keywords.keys():
            if q.text == 'in' and " in " in txt:
                q.boost = -1
            elif q.text in keyword or keyword in q.text:
                q.boost = -0.5

    weighter = SimpleWeighter(txt, B=.45, content_B=1.0, K1=1.5)
    with ix.searcher(weighting=weighter) as s:
        # Only ask the corrector for suggestions on non-trivial input.
        if len(txt) > 2:
            corrector = s.corrector("display")
            suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        else:
            suggs = []
        results = s.search_page(q,
                                1,
                                sortedby=[scores],
                                pagelen=20,
                                filter=my_filter)
        data = [[
            r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
            r["sumlevel"] if "sumlevel" in r else "",
            r["is_stem"] if "is_stem" in r else False,
            r["url_name"] if "url_name" in r else None
        ] for r in results]

        if not data and suggs:
            # Nothing matched; retry with the best spelling suggestion,
            # passing my_vars through so the variable search isn't redone.
            return do_search(suggs[0],
                             sumlevel,
                             kind,
                             tries=tries + 1,
                             limit=limit,
                             is_stem=is_stem,
                             my_vars=my_vars)

        ascores = [r.score for r in results]
        attr_max_score = max(ascores) if ascores else 0
        # raise Exception(attr_max_score, vars_max_score)
        # insert nationwide linkage
        data = nationwide_results(data, my_vars, attr_max_score,
                                  vars_max_score, txt)

        return data, suggs, tries, my_vars