def find(criteria, reindex=False):
    """
    Search for Azure CLI commands.

    :param str criteria: Query text to search for.
    :param bool reindex: Clear the current index and reindex the command modules.
    :return:
    :rtype: None
    """
    from whoosh.qparser import MultifieldParser
    from whoosh.highlight import ContextFragmenter, UppercaseFormatter

    if reindex:
        _create_index()

    ix = _get_index()
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=schema
    )

    if 'OR' in criteria or 'AND' in criteria:
        # The query looks advanced; trust the user to have built a good one.
        q = qp.parse(" ".join(criteria))
    else:
        # Help out with some ORs to provide a less restrictive search.
        q = qp.parse(" OR ".join(criteria))

    with ix.searcher() as searcher:
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser

    if reindex:
        _create_index(cmd.cli_ctx)

    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # Got a pickle error because the index was written by a different
        # Python version; recreate the index and proceed.
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)

    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema())

    if 'OR' in criteria or 'AND' in criteria:
        # The query looks advanced; trust the user to have built a good one.
        q = qp.parse(" ".join(criteria))
    else:
        # Help out with some ORs to provide a less restrictive search, and add
        # the full phrase as one quoted alternative.
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(" ".join(criteria))
        q = qp.parse(expanded_query)

    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
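The two variants above lean on private helpers (_create_index, _get_index, _get_schema, _print_hit). As a minimal, self-contained sketch of the same pattern: the four field names mirror the snippets, while the index directory and sample document are made up for illustration.

import os
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser
from whoosh.highlight import UppercaseFormatter

# Store all fields so hit.highlights() can read them back without extra I/O.
schema = Schema(cmd_name=TEXT(stored=True), short_summary=TEXT(stored=True),
                long_summary=TEXT(stored=True), examples=TEXT(stored=True))
os.makedirs("cli_index", exist_ok=True)  # hypothetical index directory
ix = create_in("cli_index", schema)
writer = ix.writer()
writer.add_document(cmd_name=u"vm create",
                    short_summary=u"Create an Azure virtual machine.")
writer.commit()

criteria = ["virtual", "machine"]
qp = MultifieldParser(['cmd_name', 'short_summary', 'long_summary', 'examples'],
                      schema=schema)
q = qp.parse(" OR ".join(criteria))  # the less restrictive form used above
with ix.searcher() as searcher:
    results = searcher.search(q)
    results.formatter = UppercaseFormatter()  # matched terms come back UPPERCASED
    for hit in results:
        print(hit['cmd_name'], '-', hit.highlights('short_summary'))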
def query_whoosh(whoosh_dir, queries, weighting=scoring.BM25F(), num_results=50):
    res_sets = []
    ix = index.open_dir(whoosh_dir)

    # Examine the effect of the scoring model on queries for key terms.
    # Highlight search terms in results by making them UPPER CASE.
    formatter = UppercaseFormatter()

    # Run queries and print results.
    for q in queries:  # e.g. "new york", "empire state building", "oculus"
        cur = []
        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("body", ix.schema).parse(q)
            results = searcher.search(query, limit=num_results)
            results.formatter = formatter
            print_header("Query: {} returned {} results for {}".format(
                q, len(results), str(weighting)))
            for i, result in enumerate(results):
                cur.append(result['url'].replace('index.txt', ''))
                print_result(i, result)
        print()
        res_sets.append(set(cur))
    return res_sets
def search(self, term):
    results = []
    # Search the original term first, then up to five spelling suggestions.
    suggestions = [term] + self.corrector.suggest(term, limit=5)
    for t in suggestions:
        query = self.parser.parse(t)
        query_res = self.searcher.search(query, limit=100)
        query_res.fragmenter.maxchars = 300
        query_res.fragmenter.surround = 100
        query_res.formatter = UppercaseFormatter()
        results.append((t, query_res))
    return results
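This method assumes a long-lived searcher, parser, and spelling corrector on the instance. One plausible wiring, with a hypothetical class name; only the Whoosh calls themselves are the library's real API:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

class Searcher:  # hypothetical owner of search()
    def __init__(self, index_dir):
        ix = open_dir(index_dir)
        self.searcher = ix.searcher()  # kept open across queries
        self.parser = QueryParser("content", ix.schema)
        # Field-based speller: suggests terms that actually occur in "content".
        self.corrector = self.searcher.corrector("content")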
def search_docstrings(ix, query_string, verbose=0):
    from whoosh.qparser import QueryParser
    from whoosh.highlight import UppercaseFormatter

    with ix.searcher() as searcher:
        query = QueryParser('doc', ix.schema).parse(query_string)
        results = searcher.search(query)
        if verbose > 0:
            results.formatter = UppercaseFormatter()
            for r in results:
                print('Result %d (%g): %s' % (r.rank + 1, r.score, r['name']))
                if verbose == 2:
                    print(r['doc'], '\n')
                elif verbose == 1:
                    print(r.highlights('doc'), '\n')
def testHighlights(autor):
    """
    Find the highlighted fragments that contain the searched terms.
    """
    import whoosh.index
    from whoosh.qparser import QueryParser
    from whoosh.highlight import UppercaseFormatter

    ix = whoosh.index.open_dir("ficheros/index")
    parser = QueryParser("descripcion", ix.schema)
    myquery = parser.parse(autor)
    highs = []
    with ix.searcher() as searcher:
        results = searcher.search(myquery)
        results.formatter = UppercaseFormatter()
        for hit in results:
            if hit["descripcion"] is not None:
                resalto = hit.highlights("descripcion")
                highs.append(resalto)
            else:
                print("The painting has no description")
    return highs
def query_whoosh(whoosh_dir, num_results=5):
    from whoosh import index, scoring
    from whoosh.qparser import QueryParser
    from whoosh.highlight import UppercaseFormatter

    ix = index.open_dir(whoosh_dir)

    # Examine the effect of scoring on queries for key terms.
    # Highlight search terms in results by making them UPPER CASE.
    formatter = UppercaseFormatter()
    # Weighting used for ranking documents.
    weighting = scoring.BM25F()

    # Run queries and print results.
    for q in ["new york", "empire state building", "oculus"]:
        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("body", ix.schema).parse(q)
            results = searcher.search(query, limit=num_results)
            results.formatter = formatter
            print_header("Query: {} returned {} results".format(q, len(results)))
            for result in results:
                print_result(result)
        print()
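This version hardcodes BM25F, while the earlier variant takes the weighting as a parameter. A hedged usage sketch for comparing two of Whoosh's built-in ranking models with that parameterized variant; the index directory and queries are placeholders:

from whoosh import scoring

for weighting in (scoring.BM25F(B=0.75, K1=1.2), scoring.TF_IDF()):
    # Reuses the parameterized query_whoosh(whoosh_dir, queries, ...) above.
    query_whoosh("nyc_index", ["new york", "oculus"],
                 weighting=weighting, num_results=10)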
def highlights(palabra, numResultados=None):
    """
    Find the highlighted fragments that contain the searched terms.
    """
    import whoosh.index
    from whoosh.qparser import QueryParser
    from whoosh.highlight import UppercaseFormatter

    ix = whoosh.index.open_dir(indexPath)
    parser = QueryParser("descripcion", ix.schema)
    myquery = parser.parse(palabra)
    highs = []
    with ix.searcher() as searcher:
        results = searcher.search(myquery, limit=numResultados)
        results.formatter = UppercaseFormatter()
        for hit in results:
            print(hit)
            if hit["descripcion"] is not None:
                resalto = hit.highlights("descripcion")
                cuadro = hitToDict(hit)
                cuadro["highlight"] = resalto
                highs.append(cuadro)
            else:
                print("The painting has no description")
    return highs
def __init__(self, index_path, language):
    import os
    import sys

    from whoosh import index as whoosh_index
    from whoosh import qparser
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.highlight import UppercaseFormatter
    from whoosh.lang import has_stemmer, has_stopwords

    # Fall back to a plain tokenizer when Whoosh has no stemmer or stopword
    # list for the requested language.
    if not has_stemmer(language) or not has_stopwords(language):
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)

    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()
    self.index_path = index_path

    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)

    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
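A usage sketch for this constructor, assuming the class is called SearchIndex (the original name is not shown). FuzzyTermPlugin enables trailing-tilde syntax, so "serch~" matches terms within one edit, such as "search":

idx = SearchIndex("index_dir", "en")
writer = idx.search_index.writer()
writer.add_document(path=u"/docs/intro", body=u"How to search the manual")
writer.commit()

with idx.search_index.searcher() as searcher:
    query = idx.query_parser.parse(u"serch~")  # fuzzy: one edit away
    for hit in searcher.search(query):
        print(hit["path"])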
def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
    from haystack import site

    results = []
    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)
    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result['django_ct'].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if isinstance(index.fields[string_key], MultiValueField):
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields['django_ct']
            del additional_fields['django_id']

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]

                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name),
                                                        terms, sa, ContextFragmenter(terms),
                                                        UppercaseFormatter())],
                }

            result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None, result_class=None):
    from haystack import connections

    results = []
    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]

                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name),
                                                        terms, sa, ContextFragmenter(terms),
                                                        UppercaseFormatter())],
                }

            result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
writer.commit()

# Allow the user to search.
from whoosh.qparser import QueryParser
from whoosh.highlight import UppercaseFormatter

with ix.searcher() as searcher:
    print('Enter search phrase:', end=' ')
    querystring = input()
    parser = QueryParser('content', ix.schema)
    query = parser.parse(querystring)
    results = searcher.search(query)
    results.formatter = UppercaseFormatter()

    def prCyan(skk):
        print("\033[96m{}\033[00m".format(skk))

    for hit in results:
        with open(hit['path']) as f:
            contents = f.read()
        fragment = hit.highlights('content', text=contents)
        print('\n' + fragment)
        prCyan(hit['path'])
def searchdblp(self, userquery):
    documentNumber = 1
    finalresults = {
        "searchTerm": userquery,
        "searchquerydocuments": [],
        "relatedquerydocuments": []
    }
    # Expand the user query with similar words.
    similarwords = getSimilarWords(userquery.replace('"', '').replace(" ", ""))
    queries = [userquery]
    for similarword in similarwords:
        queries.append(similarword[0])

    with self.ix.searcher() as searcher:
        queryparser = QueryParser("content", schema=self.ix.schema)
        for queryString in queries:
            parsedquery = queryparser.parse(queryString)
            # Offer a spelling correction for the original query only.
            if queryString == queries[0]:
                corrected = searcher.correct_query(parsedquery, queryString)
                print(corrected.string)
                if corrected.query != parsedquery:
                    print("Did you mean:", corrected.string)

            results = searcher.search(parsedquery)
            results.formatter = UppercaseFormatter(between="~")
            for hit in results:
                filename = hit["path"] + ".json"
                filepath = os.path.join(self.dataDirPath, filename)
                with open(filepath, "r", encoding="utf-8") as f:
                    jsonfile = json.load(f)
                    filecontents = jsonfile['abstract']
                    pagerank = jsonfile['pagerank']
                    category = jsonfile['category']
                    subresult = {
                        "title": hit['title'],
                        "path": "http://127.0.0.1:5000/file/" + hit["path"],
                        # Fragments are joined on "~" by the formatter, so split
                        # on it and rejoin with an ellipsis separator.
                        "highlights": ' ... '.join(
                            x.replace("\n", " ")
                            for x in hit.highlights("content", filecontents).split("~")),
                        "pagerank": pagerank,
                        "category": category
                    }
                    documentNumber += 1
                    if queryString == userquery:
                        finalresults["searchquerydocuments"].append(subresult)
                    else:
                        finalresults["relatedquerydocuments"].append(subresult)
    return json.dumps(finalresults, indent=4)
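For reference, `between` is the string the formatter places between highlight fragments, which is why the snippet above splits on "~". A standalone demonstration using Whoosh's top-level highlight() helper; the text and term are placeholders:

from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter

text = u"Deep learning for search. Search engines rank documents by relevance."
print(highlight(text, frozenset(["search"]), StandardAnalyzer(),
                ContextFragmenter(maxchars=40, surround=20),
                UppercaseFormatter(between="~")))
# Matched terms come back as SEARCH, with fragments separated by "~".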