def test_apply():
    """Verify Query.apply() rewrites leaf queries recursively."""

    def visit(q):
        # Uppercase the text of term-like leaves; recurse into compound nodes.
        if isinstance(q, (Term, Variations, FuzzyTerm)):
            q.text = q.text.upper()
            return q
        return q.apply(visit)

    before = And([Not(Term("a", u("b"))),
                  Variations("a", u("c")),
                  Not(FuzzyTerm("a", u("d")))])
    after = visit(before)
    assert_equal(after, And([Not(Term("a", u("B"))),
                             Variations("a", u("C")),
                             Not(FuzzyTerm("a", u("D")))]))

    def term2var(q):
        # Swap every Term leaf for an equivalent Variations query.
        if isinstance(q, Term):
            return Variations(q.fieldname, q.text)
        return q.apply(term2var)

    q = And([Term("f", "alfa"),
             Or([Term("f", "bravo"), Not(Term("f", "charlie"))])])
    q = term2var(q)
    assert_equal(q, And([Variations('f', 'alfa'),
                         Or([Variations('f', 'bravo'),
                             Not(Variations('f', 'charlie'))])]))
def queryCategoryGenerator(busqueda):
    """Build an OR-chained fuzzy query over the "categoria" field.

    Each element of ``busqueda`` becomes a FuzzyTerm with edit distance 2;
    the terms are OR-ed together in order. Returns None when ``busqueda``
    is empty.
    """
    query = None
    for token in busqueda:
        term = FuzzyTerm("categoria", token, maxdist=2)
        query = term if query is None else query | term
    return query
def querySearchGenerator(busqueda):
    """Build a fuzzy search query over the "titulo" and "descripcion" fields.

    The search string is split on single spaces; each word is matched
    fuzzily in both fields, with an edit distance that grows with word
    length (len // 4). All terms are OR-ed together in order.
    """
    query = None
    for word in busqueda.split(" "):
        dist = int(len(word) / 4)
        titulo = FuzzyTerm("titulo", word, maxdist=dist)
        descripcion = FuzzyTerm("descripcion", word, maxdist=dist)
        if query is None:
            query = titulo | descripcion
        else:
            # Preserve the left-to-right chaining of the original query tree.
            query = query | titulo | descripcion
    return query
def busqueda_noticia(request):
    """Django view: search Noticia records by keywords via a Whoosh index.

    On POST with a valid form, each whitespace-separated keyword is turned
    into a FuzzyTerm query on the "titulo" field and the results are
    intersected into ``noticias``. On GET (or invalid form) an empty form
    is rendered with ``noticias`` left as None.
    """
    form = NoticiaBusquedaForm()
    noticias = None
    if request.method == 'POST':
        form = NoticiaBusquedaForm(request.POST)
        if form.is_valid():
            noticias = Noticia.objects.all()
            keywords = form.cleaned_data['keywords']
            ix = open_dir(dirindex)
            with ix.searcher() as searcher:
                temas = keywords.split()
                for x in temas:
                    # If this doesn't work well, do it with Term instead.
                    query = FuzzyTerm('titulo', x)
                    # NOTE(review): this combines a Django QuerySet with a
                    # Whoosh Results object via "&" — it is unclear from here
                    # that those types support intersection with each other;
                    # confirm this actually filters as intended.
                    noticias = noticias & searcher.search(query)
    else:
        # GET request: present a fresh, empty search form.
        form = NoticiaBusquedaForm()
    return render(request, 'busqueda_noticias.html', {
        'form': form,
        'noticias': noticias
    })
def search_doc(self, word, docTypes, numPage=1, numByPage=10, showNumResults=False):
    """
    Return a list of docs that contains given word and that matches
    given type.

    :param word: raw query string, parsed across the doctype fields.
    :param docTypes: iterable of doctype names; used both to pick the
        fields to search and to filter results by the "docType" field.
    :param numPage: 1-based page number of results to return.
    :param numByPage: number of results per page.
    :param showNumResults: when True, also include the result count.
    :return: ``{'ids': [...]}`` or ``{'ids': [...], 'numResults': n}``.
    """
    indexSchema = IndexSchema()

    # Retrieves the fields to search from the doctypes schema.
    fieldsToSearch = []
    for docType in docTypes:
        docType = docType.lower()
        try:
            schema = indexSchema.doctypesSchema[docType]
            fieldsToSearch = fieldsToSearch + schema
        except KeyError:
            # Was a bare "except:", which would swallow any error (even
            # KeyboardInterrupt). Only the missing-doctype lookup above is
            # expected to fail, so catch KeyError explicitly.
            logger.warning("Schema not found for %s" % docType)

    # By default we search "content" (for BC) and "tags".
    fields = ['content', 'tags'] + fieldsToSearch
    logger.info("Search will be performed on fields %s" % fields)

    # Creates the query parser.
    # MultifieldParser allows search on multiple fields.
    # We use a custom FuzzyTerm class to set the Levenstein distance to 2.
    parser = MultifieldParser(fields, schema=indexSchema.schema,
                              termclass=CustomFuzzyTerm)
    query = parser.parse(word)

    # Creates a filter on the doctype field.
    doctypeFilterMatcher = []
    for docType in docTypes:
        term = FuzzyTerm("docType", unicode(docType.lower()), 1.0, 2)
        doctypeFilterMatcher.append(term)
    docTypeFilter = Or(doctypeFilterMatcher)

    # Processes the search (request the index, Whoosh magic).
    with indexSchema.index.searcher() as searcher:
        results = searcher.search_page(query, numPage, pagelen=numByPage,
                                       filter=docTypeFilter)
        resultsID = [result["docId"] for result in results]
        logger.info("Results: %s" % resultsID)

        # Ensures BC if the number of results is not requested.
        if showNumResults:
            return {'ids': resultsID, 'numResults': len(results)}
        else:
            return {'ids': resultsID}
def test_fuzzyterm():
    """A FuzzyTerm for "brave" should match the documents containing "bravo"."""
    schema = fields.Schema(id=fields.STORED, f=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    corpus = [(1, "alfa bravo charlie delta"),
              (2, "bravo charlie delta echo"),
              (3, "charlie delta echo foxtrot"),
              (4, "delta echo foxtrot golf")]
    for num, text in corpus:
        w.add_document(id=num, f=u(text))
    w.commit()

    with ix.searcher() as s:
        q = FuzzyTerm("f", "brave")
        assert_equal([d["id"] for d in s.search(q)], [1, 2])
def queryCategoryGenerator(busqueda):
    """Expand category strings and OR fuzzy terms over "categoria".

    Entries containing spaces (or, failing that, slashes) are split into
    fragments; the original entry is kept in addition to its fragments.
    Every resulting piece becomes a FuzzyTerm with edit distance 2, OR-ed
    together. Returns None when there are no pieces.
    """
    pieces = []
    for entry in busqueda:
        if " " in entry:
            pieces.extend(entry.split(" "))
        elif "/" in entry:
            pieces.extend(entry.split("/"))
        # The whole entry is always appended, even when it was split above.
        pieces.append(entry)

    query = None
    for piece in pieces:
        term = FuzzyTerm("categoria", piece, maxdist=2)
        query = term if query is None else query | term
    return query
def test_query_copy_hash():
    """Every query type must deep-copy to an equal object with an equal
    hash, and must compare unequal to a slightly different variant."""

    def do(q1, q2):
        # q1 must survive a deep copy (equality and hash preserved) and
        # must differ from the variant q2.
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    # Leaf and compound queries, varied mostly by boost.
    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"), boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))

    # Range queries, including open-ended bounds.
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))

    # Phrase queries, varied by words, boost and slop.
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)

    # Special queries.
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)

    # Span queries.
    do(SpanFirst(Term("a", u("b")), limit=1),
       SpanFirst(Term("a", u("b")), limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
def __init__(self, fieldname, text, boost=1.0, maxdist=1):
    # NOTE(review): the incoming ``boost`` and ``maxdist`` arguments are
    # ignored — the parent FuzzyTerm is always constructed with boost 1.0
    # and a fixed edit distance of 2. Presumably this pins the Levenshtein
    # distance regardless of what the query parser passes in; confirm this
    # is intentional.
    FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)
def search_doc(directory, word, doc_types, num_page=1, num_by_page=10,
               show_num_results=True):
    """
    * -------------{Function}---------------
    * Returns a list of docs that contains a given set of words that
    * matches a given type.
    * -------------{returns}----------------
    * {dict} ``{"ids": [...]}`` or ``{"ids": [...], "num_results": n}``
    * -------------{params}-----------------
    * : directory -> path of the index
    * : word -> words to query
    * : doc_types -> type of doc to search
    * : num_page -> number of pages to search
    * : num_by_page -> number of results per page
    * : show_num_results -> whether to also return the result count
    """
    index_schema = load_index(directory)
    doctypes_schema = load_doctypes_schema(directory)

    # Retrieves the fields to search from the doctypes schema.
    fields_to_search = []
    for doc_type in doc_types:
        doc_type = doc_type.lower()
        try:
            schema = doctypes_schema[doc_type]
            fields_to_search = fields_to_search + schema
        except KeyError:
            # Was a bare "except:", which hides every kind of error; only
            # the missing-doctype lookup is expected to fail here.
            logger.warning(
                "Schema not found for {doc_type}".format(doc_type=doc_type))

    # By default we search "content" (for BC) and "tags".
    fields = ["content", "tags"] + fields_to_search
    logger.info(
        "search will be performed on fields {fields}".format(fields=fields))

    # Creates the query parser.
    # MultifieldParser allows search on multiple fields.
    # We use custom FuzzyTerm class to set the Leveshtein distance to 2.
    parser = MultifieldParser(fields, schema=doctypes_schema,
                              termclass=CustomFuzzyTerm)
    query = parser.parse(word)

    # Creates a filter on the doctype field.
    doctype_filter_matcher = []
    for doc_type in doc_types:
        term = FuzzyTerm("doc_type", doc_type.lower(), 1.0, 2)
        doctype_filter_matcher.append(term)
    doc_type_filter = Or(doctype_filter_matcher)

    # Processes the search (request the index, whoosh magic).
    with index_schema.searcher() as searcher:
        results = searcher.search_page(query, num_page, pagelen=num_by_page,
                                       filter=doc_type_filter)
        results_id = [result["doc_id"] for result in results]
        logger.info("Results: {results_id}".format(results_id=results_id))

        # Ensures BC if the number of results is not requested.
        if show_num_results:
            return {"ids": results_id, "num_results": len(results)}
        else:
            return {"ids": results_id}
'''
Created on Oct 27, 2014

@author: Cassie
'''
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.query import And, Term, Not, FuzzyTerm, Phrase

# Open the previously built on-disk index. (Python 2 script — note the
# print statement at the bottom.)
ix = index.open_dir("index")

# q1: cities whose text mentions "greek" and "roman" but not "persian".
q1 = And([
    Term("city_text", u"greek"),
    Term("city_text", u"roman"),
    Not(Term("city_text", u"persian"))
])
# q2: fuzzy match on "shakespeare" (the query actually executed below).
q2 = FuzzyTerm("city_text", u"shakespeare")
# q3: phrase "located below sea level" allowing up to 10 words of slop.
q3 = Phrase("city_text", [u"located", u"below", u"sea", u"level"], slop=10)

with ix.searcher() as s:
    results = s.search(q2, limit=None)
    for a in results:
        print a['city_name']
def fuzzy_term(q, dist, field):
    """Build a query for *q* on *field*: an exact Term for short words
    (three characters or fewer), otherwise a FuzzyTerm with the given
    maximum edit distance and a fixed prefix length of 1."""
    if len(q) > 3:
        return FuzzyTerm(field, q, maxdist=dist, prefixlength=1)
    return Term(field, q)
def perform_search(self, sentence):
    """Search the index for *sentence* and return ``(text, matched_terms)``
    for the best hit, or ``(None, None)`` when nothing matches.

    The query is a union of: exact AND over all tokens, exact OR, a
    length-aware fuzzy OR, a non-brand-field OR, and (for multi-token
    input) a fuzzy OR over underscore-joined bigrams.
    """
    with self._searcher() as s:
        # Tokenize and drop placeholder tokens.
        tokens = sentence.split()
        tokens = [token for token in tokens if token != REPLACED]
        print('tokens=', tokens)

        exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens], boost=.5)
        exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens],
                            boost=.5, scale=0.9)
        # Added variability of maxdist based on word length: short words
        # (< 8 chars) tolerate 1 edit, longer ones 2; words under 4 chars
        # are not fuzzed at all.
        fuzzy_or_match = Or([
            FuzzyTerm(TEXT_FIELD, t, prefixlength=1,
                      maxdist=1 if len(t) < 8 else 2)
            for t in tokens if len(t) >= 4
        ], boost=.2, scale=0.9)

        if len(tokens) > 1:
            # Add bigrams if there are any.
            bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
            bigram_fuzzy_or_match = Or([
                FuzzyTerm(BIGRAMS_FIELD, b, prefixlength=3,
                          maxdist=2 if len(b) < 8 else 3)
                for b in bigrams
            ], scale=0.9)
        else:
            bigram_fuzzy_or_match = None

        non_brand_or_match = Or(
            [Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

        # Union of all sub-queries; the bigram part only when present.
        q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match
        if bigram_fuzzy_or_match:
            q = q | bigram_fuzzy_or_match

        print(q)
        search_results = self.get_search_results(self._index, s, q)

        for x in search_results:
            print(x, x.score)

        if search_results:
            # NOTE(review): unpacking ``.items()`` of the top hit into
            # exactly (score, text, matched) relies on that object yielding
            # three items in this order — confirm against
            # get_search_results' return type.
            score, text, matched = search_results[0].items()
            return text, list(set(matched))
        else:
            return None, None
def fuzzy_term(q: str, dist: int, field: str) -> "Term | FuzzyTerm":
    """Build a query for *q* on *field*.

    Words of three characters or fewer get an exact ``Term``; longer words
    get a ``FuzzyTerm`` with maximum edit distance *dist* and a fixed
    prefix length of 1.

    The return annotation was ``Term``, but whoosh's ``FuzzyTerm`` is not
    a ``Term`` subclass (it derives from ``ExpandingTerm``), so the
    annotation is widened to the honest union (as a string, so no extra
    imports are needed).
    """
    if len(q) <= 3:
        return Term(field, q)
    return FuzzyTerm(field, q, maxdist=dist, prefixlength=1)
def search(self, query):
    """Fuzzy-search each word of *query* (edit distance 2) against the
    "content" field and return the matching filenames."""
    with self.index.searcher() as searcher:
        combined = Or([FuzzyTerm("content", word, maxdist=2)
                       for word in query])
        hits = searcher.search(combined)
        return [hit["filename"] for hit in hits]