def test_gtlt():
    """Exercise the GtLtPlugin: '>', '<=', and operators left dangling."""
    schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC, c=fields.KEYWORD,
                           d=fields.NUMERIC(float), e=fields.DATETIME)
    parser = qparser.QueryParser("a", schema)
    parser.add_plugin(plugins.GtLtPlugin())
    parser.add_plugin(dateparse.DateParserPlugin())

    parsed = parser.parse(u("a:hello b:>100 c:<=z there"))
    assert_equal(parsed.__class__, query.And)
    assert_equal(len(parsed), 4)
    assert_equal(parsed[0], query.Term("a", "hello"))
    assert_equal(parsed[1], query.NumericRange("b", 100, None, startexcl=True))
    assert_equal(parsed[2], query.TermRange("c", None, 'z'))
    assert_equal(parsed[3], query.Term("a", "there"))

    parsed = parser.parse(u("hello e:>'29 mar 2001' there"))
    assert_equal(parsed.__class__, query.And)
    assert_equal(len(parsed), 3)
    assert_equal(parsed[0], query.Term("a", "hello"))
    # As of this writing, date ranges don't support startexcl/endexcl
    assert_equal(parsed[1], query.DateRange("e", datetime(2001, 3, 29, 0, 0), None))
    assert_equal(parsed[2], query.Term("a", "there"))

    # A gt/lt operator with nothing attached degrades to plain terms.
    parsed = parser.parse(u("a:> alfa c:<= bravo"))
    assert_equal(text_type(parsed), "(a:a: AND a:alfa AND a:c: AND a:bravo)")

    # Without the fields/range plugins the operator text passes through.
    parser.remove_plugin_class(plugins.FieldsPlugin)
    parser.remove_plugin_class(plugins.RangePlugin)
    parsed = parser.parse(u("hello a:>500 there"))
    assert_equal(text_type(parsed), "(a:hello AND a:a: AND a:500 AND a:there)")
def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0):
    """Build a NumericRange query after round-tripping both endpoints
    through this field's text codec.

    :param start: lower bound, or None for an open lower end.
    :param end: upper bound, or None for an open upper end.
    :raises QueryParserError: if either endpoint fails conversion.
    """
    from whoosh import query
    from whoosh.qparser.common import QueryParserError

    try:
        # Round-trip through the field codec to normalize the values.
        if start is not None:
            start = self.from_text(self.to_text(start))
        if end is not None:
            end = self.from_text(self.to_text(end))
    except Exception as e:
        # Fix: replaced the legacy ``e = sys.exc_info()[1]`` retrieval with
        # ``as e`` and chained the original exception so the underlying
        # cause is preserved in the traceback.
        raise QueryParserError(e) from e

    return query.NumericRange(fieldname, start, end, startexcl, endexcl,
                              boost=boost)
def test_searching():
    """Run a battery of query types against the shared test index."""
    with make_index().searcher() as s:
        def check(q, expected, **kwargs):
            # Compare the ids of the hits, in rank order, to the expected list.
            hits = s.search(q, **kwargs)
            assert_equal([d["id"] for d in hits], expected)

        check(query.Term("text", u("format")), ["format", "vector"])
        check(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        check(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        check(query.Wildcard("id", u("*st*")), ["stored", "const"])
        check(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        check(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        check(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"], limit=None)
        check(query.Every(),
              ["fieldtype", "format", "vector", "scorable", "stored",
               "unique", "const"])
        check(query.Every("subs"),
              ["fieldtype", "format", "vector", "scorable", "stored",
               "unique", "const"])
def do_search(txt, sumlevel=None, kind=None, tries=0, limit=10, is_stem=None):
    """Search the attribute index for *txt*.

    Optional kind/sumlevel/is_stem arguments narrow the search via a filter
    query.  If nothing matches, retries once per spelling suggestion, up to
    three attempts total.  Returns a (rows, suggestions, tries) triple.
    """
    txt = txt.replace(",", "")

    # Build the metadata filter from whichever facets were supplied.
    my_filter = None
    if kind and sumlevel:
        my_filter = query.And([query.Term("kind", kind),
                               query.Term("sumlevel", sumlevel)])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)

    # Restrict to STEM entries when requested, AND-ing with any facet filter.
    if is_stem and is_stem > 0:
        stem_filter = query.NumericRange("is_stem", 1, is_stem)
        my_filter = stem_filter if my_filter is None else my_filter & stem_filter

    # Give up after the initial call plus two autocorrect retries.
    if tries > 2:
        return [], [], []

    q = qp.parse(txt)
    weighter = SimpleWeighter(txt, B=.45, content_B=1.0, K1=1.5)
    with ix.searcher(weighting=weighter) as s:
        # Only compute spelling suggestions for queries longer than 2 chars.
        if len(txt) > 2:
            suggs = s.corrector("display").suggest(txt, limit=10, maxdist=2,
                                                   prefix=3)
        else:
            suggs = []

        results = s.search_page(q, 1, sortedby=[scores], pagelen=20,
                                filter=my_filter)
        data = [[r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
                 r["sumlevel"] if "sumlevel" in r else "",
                 r["is_stem"] if "is_stem" in r else False,
                 r["url_name"] if "url_name" in r else None]
                for r in results]

        # Nothing found: retry with the best spelling suggestion.
        if not data and suggs:
            return do_search(suggs[0], sumlevel, kind, tries=tries + 1,
                             limit=limit, is_stem=is_stem)
        return data, suggs, tries
def parse_query(self, fieldname, qstring, boost=1.0):
    """Parse *qstring* as a date and return a matching query.

    An unambiguous date becomes a Term; an ambiguous one (e.g. just a year)
    becomes a NumericRange covering the span it implies.
    """
    from whoosh import query
    from whoosh.support.times import is_ambiguous

    at = self._parse_datestring(qstring)
    if is_ambiguous(at):
        # The string only pins down part of a date, so match everything
        # from its floor to its ceiling.
        startnum = datetime_to_long(at.floor())
        endnum = datetime_to_long(at.ceil())
        # Fix: propagate boost — it was silently dropped on this branch
        # while the Term branch below honored it.
        return query.NumericRange(fieldname, startnum, endnum, boost=boost)
    else:
        return query.Term(fieldname, self.to_text(at), boost=boost)
def test_numeric_ranges_unsigned():
    """An unsigned NUMERIC field must answer an open-ended range correctly."""
    values = [1, 10, 100, 1000, 2, 20, 200, 2000, 9, 90, 900, 9000]
    schema = fields.Schema(num2=fields.NUMERIC(stored=True, signed=False))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for v in values:
            w.add_document(num2=v)

    with ix.searcher() as s:
        # Exclusive-start range [55, +inf): every hit's value must be >= 55.
        q = query.NumericRange("num2", 55, None, True, False)
        for hit in s.search(q, limit=None):
            assert int(hit["num2"]) >= 55
def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0):
    """Build a query matching dates between *start* and *end*.

    Endpoints are parsed as date strings and widened to whole-unit
    boundaries (floor for the start, ceiling for the end).  ``startexcl``
    and ``endexcl`` are accepted for interface compatibility but not
    applied — date ranges don't support exclusive endpoints here.
    """
    from whoosh import query

    # Fully open range: simply match every document with this field.
    if start is None and end is None:
        return query.Every(fieldname, boost=boost)

    if start is not None:
        start = datetime_to_long(self._parse_datestring(start).floor())
    if end is not None:
        end = datetime_to_long(self._parse_datestring(end).ceil())

    return query.NumericRange(fieldname, start, end, boost=boost)
def parse_query(self, fieldname, qstring, boost=1.0):
    """Parse *qstring* as a date; return an error query if parsing fails.

    An unambiguous date becomes a Term; an ambiguous one (e.g. just a year)
    becomes a NumericRange covering the span it implies.
    """
    from whoosh import query
    from whoosh.util.times import is_ambiguous

    try:
        at = self._parse_datestring(qstring)
    except Exception as e:
        # Fix: was a bare ``except:``, which would also swallow
        # SystemExit/KeyboardInterrupt.  Parse failures are surfaced as an
        # error query rather than raised.
        return query.error_query(e)

    if is_ambiguous(at):
        # Partially-specified date: match the whole span it implies.
        startnum = datetime_to_long(at.floor())
        endnum = datetime_to_long(at.ceil())
        return query.NumericRange(fieldname, startnum, endnum)
    else:
        return query.Term(fieldname, at, boost=boost)
def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0):
    """Validate and normalize both endpoints, then build a NumericRange.

    :raises QueryParserError: if an endpoint is not a valid number for
        this field.
    """
    from whoosh import query
    from whoosh.qparser.common import QueryParserError

    # Guard-clause style: reject an invalid endpoint before normalizing it.
    if start is not None and not self.is_valid(start):
        raise QueryParserError("Range start %r is not a valid number" % start)
    if start is not None:
        start = self.prepare_number(start)

    if end is not None and not self.is_valid(end):
        raise QueryParserError("Range end %r is not a valid number" % end)
    if end is not None:
        end = self.prepare_number(end)

    return query.NumericRange(fieldname, start, end, startexcl, endexcl,
                              boost=boost)
def test_numeric_filter():
    """A numeric-range filter must exclude docs whose value is out of range."""
    schema = fields.Schema(status=fields.NUMERIC, tags=fields.TEXT)
    ix = RamStorage().create_index(schema)

    # Add a single document with status = -2
    with ix.writer() as w:
        w.add_document(status=-2, tags=u"alfa bravo")

    with ix.searcher() as s:
        fq = query.NumericRange("status", 0, 2)

        # The filter query alone matches nothing (-2 is outside [0, 2])...
        assert s.search(fq).scored_length() == 0

        # ...while the text query alone matches the document...
        q = query.Term("tags", u"alfa")
        assert s.search(q).scored_length() == 1

        # ...so the text query combined with the filter must match nothing.
        assert s.search(q, filter=fq).scored_length() == 0
endexcl, boost=1.0): from whoosh import query from whoosh.qparser import QueryParserError try: if start is not None: start = self.from_text(self.to_text(start)) if end is not None: end = self.from_text(self.to_text(end)) except Exception, e: raise QueryParserError(e) return query.NumericRange(fieldname, start, end, startexcl, endexcl, boost=boost) def sortable_values(self, ixreader, fieldname): from_text = self._from_text for text in ixreader.lexicon(fieldname): if text[0] != "\x00": # Only yield the full-precision values break yield (text, from_text(text)) class DATETIME(NUMERIC):
def do_search(txt, sumlevel=None, kind=None, tries=0, limit=10, is_stem=None,
              my_vars=None):
    """Search attributes for *txt*, additionally detecting related variables.

    Optional kind/sumlevel/is_stem arguments narrow the attribute search via
    a filter query.  ``my_vars`` carries variable matches forward from a
    previous (pre-autocorrect) attempt so the variable search is not redone.
    Retries with the top spelling suggestion up to three attempts total.
    Returns a (data, suggestions, tries, my_vars) tuple.
    """
    txt = txt.replace(",", "")

    # Build the metadata filter from the kind/sumlevel facets.
    my_filter = None
    if kind and sumlevel:
        kf = query.Term("kind", kind)
        sf = query.Term("sumlevel", sumlevel)
        my_filter = query.And([kf, sf])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)

    # Optionally AND in a STEM restriction (is_stem values in [1, is_stem]).
    if is_stem and is_stem > 0 and my_filter is not None:
        my_filter = my_filter & query.NumericRange("is_stem", 1, is_stem)
    elif is_stem and is_stem > 0 and my_filter is None:
        my_filter = query.NumericRange("is_stem", 1, is_stem)

    # Give up after the initial call plus two autocorrect retries.
    if tries > 2:
        return [], [], [], []

    q = qp.parse(txt)

    # Build a stemmed copy of the query text for the variables index
    # (words longer than 3 characters are stemmed, short ones kept as-is).
    rext = RegexTokenizer()
    var_txt = u" ".join([
        stem(token.text) if len(token.text) > 3 else token.text
        for token in rext(unicode(txt))
    ])
    var_q = vars_qp.parse(var_txt)
    var_keywords = {}
    vars_max_score = None
    # search for variables in query
    if not my_vars:
        # my_vars can save original vars detected before autocorrecting for spelling,
        # so we'll only do var searches that haven't yet been autocorrected
        with vars_ix.searcher() as s:
            # s = vars_ix.searcher()
            results = s.search(var_q)
            # raise Exception(list(results)[0])
            vscores = [r.score for r in results]
            vars_max_score = max(vscores) if vscores else None
            my_vars = [{
                "matched_on": r.highlights("name"),
                "name": r["name"],
                "description": r["description"].split(","),
                "section": r["section"],
                "section_title": r["section_title"],
                "related_attrs": r["related_attrs"].split(","),
                "related_vars": r["related_vars"].split(","),
                "params": json.loads(r["params"]) if 'params' in r else None
            } for r in results]

    if my_vars:
        # De-duplicate variables sharing the same related_vars group, and
        # collect the <b>-highlighted terms so they can be down-boosted below.
        already_seen = []
        filtered_my_vars = []
        for my_var in my_vars:
            if my_var["related_vars"] not in already_seen:
                filtered_my_vars.append(my_var)
                already_seen.append(my_var["related_vars"])
            highlight_txt = my_var["matched_on"]
            if highlight_txt:
                matches = re.findall(r'<b class="[^>]+">([^>]+)</b>', highlight_txt)
                if matches:
                    for matched_txt in matches:
                        var_keywords[matched_txt] = True
        my_vars = filtered_my_vars

    # Down-boost attribute-query terms that already matched a variable, and
    # the connector word 'in'.  NOTE(review): the except branch presumably
    # handles single-term queries that don't support iteration — confirm.
    try:
        for term in q:
            for keyword in var_keywords.keys():
                if term.text == 'in' and " in " in txt:
                    term.boost = -1
                elif term.text in keyword or keyword in term.text:
                    term.boost = -0.5
    except NotImplementedError:
        for keyword in var_keywords.keys():
            if q.text == 'in' and " in " in txt:
                q.boost = -1
            elif q.text in keyword or keyword in q.text:
                q.boost = -0.5

    weighter = SimpleWeighter(txt, B=.45, content_B=1.0, K1=1.5)
    with ix.searcher(weighting=weighter) as s:
        # Only compute spelling suggestions for queries longer than 2 chars.
        if len(txt) > 2:
            corrector = s.corrector("display")
            suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        else:
            suggs = []

        results = s.search_page(q, 1, sortedby=[scores], pagelen=20,
                                filter=my_filter)
        data = [[
            r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
            r["sumlevel"] if "sumlevel" in r else "",
            r["is_stem"] if "is_stem" in r else False,
            r["url_name"] if "url_name" in r else None
        ] for r in results]

        # Nothing found: retry with the best suggestion, carrying my_vars
        # forward so the variable search isn't repeated.
        if not data and suggs:
            return do_search(suggs[0], sumlevel, kind, tries=tries + 1,
                             limit=limit, is_stem=is_stem, my_vars=my_vars)

        ascores = [r.score for r in results]
        attr_max_score = max(ascores) if ascores else 0
        # raise Exception(attr_max_score, vars_max_score)

        # insert nationwide linkage
        data = nationwide_results(data, my_vars, attr_max_score,
                                  vars_max_score, txt)

        return data, suggs, tries, my_vars