def search(self, text, scope=None, limit=20):
    """Search from the current index

    Args:
        text (str): String to search for
        scope (str, optional): Scope to limit the search. Defaults to None.
        limit (int, optional): Limit number of search results. Defaults to 20.

    Returns:
        [List(_dict)]: Search results
    """
    ix = self.get_index()

    results = None
    out = []

    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(text)

        filter_scoped = None
        if scope:
            filter_scoped = Prefix(self.id, scope)
        results = searcher.search(query, limit=limit, filter=filter_scoped)

        for r in results:
            out.append(self.parse_result(r))

    return out
def generic(idx, qs=None, q=None, limit=5, parser=None, page=1):
    if qs is q is None:
        raise ValueError('cannot have a null querystring and query')

    if parser is None:
        parser = MultifieldParser(
            ['title', 'keywords', 'summary', 'content', 'author'],
            idx.schema,
            group=OrGroup)

    # add better date parsing support
    parser.add_plugin(DateParserPlugin())
    parser.remove_plugin_class(WildcardPlugin)

    with idx.searcher() as search:
        # generate the Query object
        if qs:
            query = parser.parse(qs)
        else:
            query = q

        facet = MultiFacet()
        facet.add_score()
        facet.add_field('modified', reverse=True)
        facet.add_field('title')

        results = search.search_page(
            query, pagenum=page, sortedby=facet, pagelen=limit)

        res = clean_results(idx, results, query)

        # pagination attributes from the `search_page` result
        res.page_number = results.pagenum   # current page number
        res.page_total = results.pagecount  # total pages in results
        res.offset = results.offset         # first result of current page
        res.pagelen = results.pagelen       # max number of results per page

    return res
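A minimal usage sketch for the paginated helper above, assuming `idx` is an open Whoosh index whose schema defines the parsed fields; the directory name and query string are purely illustrative:

from whoosh.index import open_dir

idx = open_dir("indexdir")  # hypothetical index directory
res = generic(idx, qs="whoosh pagination", limit=10, page=2)
print(res.page_number, "/", res.page_total)  # pagination attributes set above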
def search(index_name, text, scope=None, limit=20):
    index_dir = get_index_path(index_name)
    ix = open_dir(index_dir)

    results = None
    out = []

    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(text)

        filter_scoped = None
        if scope:
            filter_scoped = Prefix("path", scope)
        results = searcher.search(query, limit=limit, filter=filter_scoped)

        for r in results:
            title_highlights = r.highlights("title")
            content_highlights = r.highlights("content")
            out.append(
                frappe._dict(
                    title=r["title"],
                    path=r["path"],
                    title_highlights=title_highlights,
                    content_highlights=content_highlights,
                ))

    return out
def search(querystring, language_code):
    ix = LanguageIndex(
        settings.WHOOSH_INDEX_PATH, language_code, _get_schema()).load()

    # parser = QueryParser('content', ix.schema)
    parser = MultifieldParser(
        ['title', 'keywords', 'content'], ix.schema)
        # fieldboosts={'title': 5, 'keywords': 4, 'content': 1})
    parser.remove_plugin_class(
        WildcardPlugin)  # remove unused feature for better performance
    query = parser.parse(querystring)

    result = {
        'results': [],
    }
    with ix.searcher() as searcher:
        results = searcher.search(query)

        # collect results
        for hit in results:
            my_hit = {}
            my_hit['score'] = hit.score
            my_hit['object'] = Article.objects.get(code=hit.fields()['code'])
            # .exclude(published=False).exclude(release_date__gte=datetime.today())
            result['results'].append(my_hit)

    return result
def getSpecialCasesResults(self, value, translation):
    """Takes in a query and compares it to hard-coded special cases.

    The special cases are for the "Miracle Letters".

    :param value: The query string to compare against the special cases
    :type value: str
    :param translation: The requested translation type
    :type translation: str
    :return: A list of ayah matches if there is a match, otherwise None
    :rtype: list, None
    """
    matchingAyahList = []
    for case in SPECIAL_CASES:
        if case[0] == value:
            value = case[1]
            matchingAyahList = case[2]

    if len(matchingAyahList) > 0:
        allowedResults = []
        for matchingAyah in matchingAyahList:
            allowedResults.append(
                "surah_num:" + str(matchingAyah[0]) +
                " AND ayah_num:" + str(matchingAyah[1]))

        parser = MultifieldParser(["surah_num", "ayah_num"], self._ix.schema)
        parser.remove_plugin_class(PhrasePlugin)
        parser.add_plugin(SequencePlugin())
        query = parser.parse(" OR ".join(allowedResults))

        with self._ix.searcher() as searcher:
            results = searcher.search(query, limit=7)
            return self._getResponseObjectFromParams(
                value,
                self._getMatchesFromResults(results, translation),
                [],
                []
            )
    else:
        return None
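As a sketch, this is roughly what the OR-joined query string built above looks like for a hypothetical match list (the surah/ayah numbers are illustrative, not from the source); since AND binds tighter than OR in Whoosh's default grammar, each pair stays grouped:

matchingAyahList = [(24, 35), (2, 255)]  # hypothetical values
allowedResults = [
    "surah_num:" + str(surah) + " AND ayah_num:" + str(ayah)
    for surah, ayah in matchingAyahList
]
print(" OR ".join(allowedResults))
# surah_num:24 AND ayah_num:35 OR surah_num:2 AND ayah_num:255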
def search(self, text, scope=None, limit=20):
    """Search from the current index

    Args:
        text (str): String to search for
        scope (str, optional): Scope to limit the search. Defaults to None.
        limit (int, optional): Limit number of search results. Defaults to 20.

    Returns:
        [List(_dict)]: Search results
    """
    ix = self.get_index()

    results = None
    out = []

    search_fields = self.get_fields_to_search()
    fieldboosts = {}

    # apply reducing boost on fields based on order: 1.0, 0.5, 0.33 and so on
    for idx, field in enumerate(search_fields, start=1):
        fieldboosts[field] = 1.0 / idx

    with ix.searcher() as searcher:
        parser = MultifieldParser(
            search_fields,
            ix.schema,
            termclass=FuzzyTermExtended,
            fieldboosts=fieldboosts)
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(text)

        filter_scoped = None
        if scope:
            filter_scoped = Prefix(self.id, scope)
        results = searcher.search(query, limit=limit, filter=filter_scoped)

        for r in results:
            out.append(self.parse_result(r))

    return out
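A tiny sketch of the reducing-boost pattern used above, with a hypothetical field order (the field names are illustrative, not from the source):

# Mirrors the boost computation above: earlier fields weigh more.
fields = ["title", "content", "tags"]  # hypothetical field order
print({field: round(1.0 / idx, 2) for idx, field in enumerate(fields, start=1)})
# -> {'title': 1.0, 'content': 0.5, 'tags': 0.33}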
def search(self, queries, fuzzy=True, default_fields=[], max_results=None):
    if not isinstance(queries, list):
        queries = [queries]
    if not isinstance(default_fields, list):
        default_fields = [default_fields]

    # wrap a single bare term in wildcards so fuzzy matching applies
    if fuzzy and len(queries) == 1 and len(queries[0].split()) == 1 \
            and ':' not in queries[0] and '*' not in queries[0]:
        queries = ['*%s*' % (queries[0])]

    for query in queries:
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        log.msg('search query: %s' % (query))

        with self.ix.searcher() as searcher:
            parser = MultifieldParser(default_fields, self.ix.schema)
            # swap the stock wildcard plugin for the locally imported variant
            parser.remove_plugin_class(plugins.WildcardPlugin)
            parser.add_plugin(WildcardPlugin)
            query = parser.parse(query)
            log.msg('search query parsed: %s' % (query))

            results = searcher.search(query, limit=None)
            count = 0
            for result in results:
                yield result['oid']
                count += 1
                if max_results and count >= max_results:
                    break
writer = ix.writer()

_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")
_distance = 0
if not normal:
    _distance = int(sys.argv[3])

with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(
        ["title", "sub_title", "author", "content"], schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())

    if normal:
        string = _string
        query = parser.parse(string)
    else:
        # proximity search: allow extra positions between the quoted terms
        distance = _distance
        proximity_query = '"' + _string + '"' + '~' + str((1 + distance) * 3)
        query = parser.parse(proximity_query)

    sys.stdout.buffer.write(">>>>>>OUTPUT start<<<<<<".encode('utf-8'))
    results = searcher.search(query, limit=20)
    results.fragmenter.maxchars = 100
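A sketch of the proximity string assembled above, with illustrative inputs; `SequencePlugin` lets the parser treat the quoted-phrase-with-tilde syntax as a sequence query with the given slop:

_string, distance = "open source", 1  # illustrative values
proximity_query = '"' + _string + '"' + '~' + str((1 + distance) * 3)
print(proximity_query)  # "open source"~6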
def get_result(self, value: str, translation: str,
               limit: Optional[int] = None) -> Dict:
    """
    Checks to see if there are any special case results, then searches for
    an Ayah with Whoosh if no special cases exist.

    :param value: The ayah text.
    :param translation: The type of translation.
    :param limit: The max limit on the number of queries to return.
    :return: A dictionary with query text, matches, matched terms, and
        suggestions.
    """
    # Start with special cases
    value = self.adjust_for_special_cases(value)

    # Then execute the full search
    with self._ix.searcher() as searcher:
        # Check if it's just one word
        is_single_word_query = False
        if len(value.split()) == 1:
            parser = MultifieldParser(
                ["simple_ayah", "roots", "decomposed_ayah"], self._ix.schema)
            is_single_word_query = True
        else:
            parser = QueryParser("simple_ayah", self._ix.schema)

        # Execute the initial search
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(value)
        results = searcher.search(query, limit=limit)

        if results:
            final_matches = self._get_matches_from_results(
                results, translation)
            return self._get_response_object_from_params(
                value, final_matches, value.split(' '), [])

        if not is_single_word_query:
            results = self._parse_and_search(
                searcher, 'simple_ayah', value, limit, True)
            if not results:
                results = self._parse_and_search(
                    searcher, 'roots', value, limit, True)
            if not results:
                results = self._parse_and_search(
                    searcher, 'decomposed_ayah', value, limit, True)

            if results:
                matched_terms = results.matched_terms()
                first_results = None
                if len(matched_terms) > 1 and results.scored_length() > 1:
                    if results[1].score > 10:
                        first_results = results
                        results = self._parse_and_search(
                            searcher, 'simple_ayah',
                            results[0]['simple_ayah'], limit)

                final_matches = self._get_matches_from_results(
                    results, translation)

                suggestions = []
                if first_results:
                    for result in [
                            fR for fR in first_results if fR.score > 10
                    ]:
                        suggestions.append(result['simple_ayah'])

                return self._get_response_object_from_params(
                    value,
                    final_matches,
                    # `term` is a tuple where the second index contains the
                    # matching term.
                    [term[1].decode('utf-8') for term in matched_terms],
                    suggestions)

    return self._get_empty_response(value)
import whoosh.index
from whoosh.qparser import MultifieldParser, OrGroup, WildcardPlugin

whoosh_idx = whoosh.index.open_dir("whoosh_idx", indexname="nasdaq")

query_parser = MultifieldParser(
    ["title", "article"], schema=whoosh_idx.schema, group=OrGroup)
query_parser.remove_plugin_class(WildcardPlugin)
parsed_query = query_parser.parse("What market does FitBit compete in?")

with whoosh_idx.searcher() as searcher:
    search_results = searcher.search(parsed_query, limit=1)
    for sr in search_results:
        print(sr["title"])
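Because the parser is built with group=OrGroup, terms are OR'd rather than AND'd, so the natural-language question above still matches documents containing only some of its words; printing the parsed query shows the expansion (output shape is approximate):

print(parsed_query)
# roughly: (title:what OR article:what) OR (title:market OR article:market) OR ...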
if ALWAYS_REBUILD:
    index = rebuild_index()
else:
    index = open_dir(search_path)

# TODO: Doesn't work. Only triggers once.
# timer = threading.Timer(86400, index.optimize)
# timer.daemon = True
# timer.start()

writer = AsyncWriter(index, 0.5)

parser = MultifieldParser(
    ["content", "title"], schema, {"content": 1.0, "title": 2.0})
# parser.add_plugin(SingleQuotePlugin())
# https://whoosh.readthedocs.io/en/latest/api/qparser.html#plug-ins
parser.remove_plugin_class(FieldsPlugin)


def search(q):
    if len(q) < min_search_length:
        return []

    result = []
    with index.searcher() as searcher:
        search_result = searcher.search(parser.parse(q))
        result = [item.fields() for item in search_result]
    return result
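A usage sketch for the search() helper above, assuming min_search_length is configured elsewhere and the indexed documents carry a stored title field; the query term is illustrative:

for fields in search("wiki"):  # hypothetical query term
    print(fields.get("title"))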
def getResult(self, value, translation):
    specialCasesResults = self.getSpecialCasesResults(value, translation)
    if specialCasesResults:
        return specialCasesResults

    with self._ix.searcher() as searcher:
        isSingleWordQuery = False
        if len(value.split()) == 1:
            parser = MultifieldParser(
                ["simple_ayah", "roots", "decomposed_ayah"], self._ix.schema
            )
            isSingleWordQuery = True
        else:
            parser = QueryParser("simple_ayah", self._ix.schema)

        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(value)
        results = searcher.search(query, limit=None)

        if results:
            finalMatches = self._getMatchesFromResults(results, translation)
            return self._getResponseObjectFromParams(
                value, finalMatches, value.split(' '), []
            )

        if not isSingleWordQuery:
            parser = QueryParser("simple_ayah", self._ix.schema, group=OrGroup)
            parser.remove_plugin_class(FieldsPlugin)
            parser.remove_plugin_class(WildcardPlugin)
            query = parser.parse(value)
            results = searcher.search(query, terms=True, limit=None)

            if not results:
                parser = QueryParser("roots", self._ix.schema, group=OrGroup)
                parser.remove_plugin_class(FieldsPlugin)
                parser.remove_plugin_class(WildcardPlugin)
                query = parser.parse(value)
                results = searcher.search(query, terms=True, limit=None)

            if not results:
                parser = QueryParser(
                    "decomposed_ayah", self._ix.schema, group=OrGroup)
                parser.remove_plugin_class(FieldsPlugin)
                parser.remove_plugin_class(WildcardPlugin)
                query = parser.parse(value)
                results = searcher.search(query, terms=True, limit=None)

            if results:
                matchedTerms = results.matched_terms()
                firstResults = None
                if len(matchedTerms) > 1 and results.scored_length() > 1:
                    if results[1].score > 10:
                        firstResults = results
                        parser = QueryParser("simple_ayah", self._ix.schema)
                        parser.remove_plugin_class(FieldsPlugin)
                        parser.remove_plugin_class(WildcardPlugin)
                        query = parser.parse(results[0]["simple_ayah"])
                        results = searcher.search(query, limit=None)

                finalMatches = self._getMatchesFromResults(results, translation)

                suggestions = []
                if firstResults:
                    for result in [fR for fR in firstResults if fR.score > 10]:
                        suggestions.append(result['simple_ayah'])

                return self._getResponseObjectFromParams(
                    value,
                    finalMatches,
                    # term is a tuple where the second index contains the
                    # matching term
                    [term[1] for term in matchedTerms],
                    suggestions
                )

    return self._getEmptyResponse(value)
from whoosh.analysis import NgramAnalyzer
from whoosh.fields import NGRAMWORDS, STORED, TEXT, Schema
from whoosh.index import create_in
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin

analyzer = NgramAnalyzer(3)

schema = Schema(
    id=STORED,
    category=TEXT(field_boost=3.0),
    # title=TEXT(analyzer, False)
    title=NGRAMWORDS(2, 20, False, 2.0))

index = create_in("search", schema)
# index = open_dir("search")

writer = index.writer()
writer.add_document(id=0, title="Test Words")
writer.add_document(id=1, title="Apple Banana Cucumber")
writer.add_document(id=2, title="Deck Elevator Floor", category="test")
writer.add_document(id=3, title="Pen Pineapple Apple Pen")
writer.commit()

# parser = QueryParser("title", schema)
parser = MultifieldParser(
    ["category", "title"], schema, {"category": 3.0, "title": 2.0})
parser.remove_plugin_class(FieldsPlugin)

with index.searcher() as searcher:
    result = searcher.search(parser.parse("Test"))
    print(str(len(result)) + " results")
    for f in result:
        print(f["id"])
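Since `title` is an NGRAMWORDS field with a minimum gram size of 2, even partial word fragments match; a sketch continuing the session above:

with index.searcher() as searcher:
    # "App" is an indexed ngram of "Apple", so both Apple documents match.
    hits = searcher.search(parser.parse("App"))
    for hit in hits:
        print(hit["id"])  # expected: 1 and 3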