def find(cmd, criteria, reindex=False):
    """Search the CLI help index for the given criteria and print each hit.

    :param cmd: CLI command object carrying ``cli_ctx``.
    :param criteria: list of search words; may contain explicit OR/AND.
    :param reindex: force a rebuild of the index before searching.
    """
    from whoosh.qparser import MultifieldParser
    if reindex:
        _create_index(cmd.cli_ctx)
    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # got a pickle error because the index was written by a different python version
        # recreate the index and proceed
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema()
    )
    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        # BUGFIX: quote the joined terms, not the repr of the criteria list
        # (the old code produced e.g. "['vm', 'create']" inside the query).
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(" ".join(criteria))
        q = qp.parse(expanded_query)
    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
def search_results(ix, search_query, fields):
    """Search *ix* over *fields* and return a list of hit dicts.

    Hits from the AND-grouped parse come first; hits that only the
    OR-grouped parse found are appended after them.  Each row carries a
    ``context`` key with de-duplicated highlight fragments.
    """
    parser_or = MultifieldParser(fields, schema=ix.schema, group=qparser.OrGroup)
    parser_and = MultifieldParser(fields, schema=ix.schema)
    query_or = parser_or.parse(search_query)
    query_and = parser_and.parse(search_query)

    def _context(hit):
        # Concatenate highlight fragments (skipping duplicates), closing
        # any unbalanced parenthesis left open by the fragmenter.
        text = str()
        for field in fields:
            if len(hit.highlights(field)) > 0 and hit.highlights(field) not in text:
                text += re.sub(r"(\(.*[^\)])", r'\1)', hit.highlights(field))
        return text

    rows = []
    with ix.searcher() as searcher:
        hits_and = searcher.search(query_and)
        hits_or = searcher.search(query_or)
        for hit in hits_and:
            row = dict(**hit)
            row["context"] = _context(hit)
            rows.append(row)
        for hit in hits_or:
            matched = False
            for prior in hits_and:
                if hit["id"] == prior["id"]:
                    matched = True
            if not matched:
                row = dict(**hit)
                row["context"] = _context(hit)
                rows.append(row)
    return rows
class Searcher(object):
    """
    Assigned to a Model class as ``search_query``, which enables text-querying.
    """

    def __init__(self, model_class, primary, index):
        # Parse over every schema field except the primary key.
        self.model_class = model_class
        self.primary = primary
        self.index = index
        self.searcher = index.searcher()
        searchable = set(index.schema._fields.keys()) - set([self.primary])
        self.parser = MultifieldParser(list(searchable), index.schema)

    def __call__(self, query, limit=None):
        """API similar to SQLAlchemy's queries. """
        session = self.model_class.query.session
        hits = self.index.searcher().search(self.parser.parse(query), limit=limit)
        primary_keys = [hit[self.primary] for hit in hits]
        if not primary_keys:
            # Dummy request...
            return session.query(self.model_class).filter("uid = -1")
        primary_column = getattr(self.model_class, self.primary)
        return session.query(self.model_class).filter(primary_column.in_(primary_keys))

    def search(self, query, limit=None):
        """New API: returns both whoosh records and SA models."""
        # TODO: highly suboptimal
        session = self.model_class.query.session
        matches = self.index.searcher().search(self.parser.parse(query), limit=limit)
        for match in matches:
            yield (match, session.query(self.model_class).get(match[self.primary]))
def getdocs():
    """Search each delimited term from the 'NPS' query parameter against the
    title/body fields and return (query, title) pairs as JSON.

    Fixes: bare ``except:`` narrowed to ``Exception``; removed the unused
    ``docs`` OrderedDict; corrected the user-facing typo "tries again".
    """
    params = dict(request.args.items())
    search_terms = params['NPS'].split(quails.DELIMITER)
    try:
        ix = index.open_dir("indexQ")
    except Exception:
        return jsonify(failure="Index not found. Ensure that index exists and try again.")
    qp = MultifieldParser(["title", "body"], schema=ix.schema)
    queries = [qp.parse(term) for term in search_terms]
    hit_list = []
    with ix.searcher() as searcher:
        for query in queries:
            results = searcher.search(query)
            for result in results:
                hit_list.append((str(query), result['title']))
    return jsonify(results=hit_list)
def search_documents(filter):
    """Search the on-disk Whoosh index for documents matching *filter*.

    A ``tags:`` prefix restricts the search to the tags field; otherwise
    path and content are searched.  Results are collapsed per path keeping
    the newest revision, sorted by path then date descending.

    Returns ``None`` when no index exists.

    Fixes: ``searcher`` could be unbound in ``finally`` if ``index.searcher()``
    raised (masking the original error with a NameError); removed the
    unreachable ``return results`` after the try/finally.
    """
    # Check for existing index
    dir_path = os.path.join(DATA_DIR, 'index')
    if not os.path.exists(dir_path) or not Index.exists_in(dir_path):
        return None
    index = Index.open_dir(dir_path)
    if filter.startswith('tags:'):
        fields = ['tags']
        filter = filter[5:]
    else:
        fields = ['path', 'content']
    parser = MultifieldParser(fields, schema=index.schema)
    search_query = parser.parse(unicode(filter))
    # Try documents search; open the searcher before the try so the finally
    # block can never reference an unbound name.
    searcher = index.searcher(closereader=False)
    try:
        return searcher.search(search_query,
                               collapse=[sorting.FieldFacet('path'),
                                         sorting.FieldFacet('content')],
                               collapse_order=sorting.FieldFacet('revision', reverse=True),
                               sortedby=[sorting.FieldFacet('path'),
                                         sorting.FieldFacet('date', reverse=True)])
    finally:
        searcher.close()
def parse(text, schema=SCHEMA): """ parse(text[, schema=SCHEMA]) Analisa e trata o texto em ``text`` de acordo com o ``schema`` do índice de documentos. .. code-block:: python >>> from storyline.engine.query import parse >>> from storyline.engine.schema import get_schema >>> >>> SCHEMA = get_schema() >>> parse("Mestre", SCHEMA) Or([Term('title', u'mestr'), Term('content', u'mestr')]) :param text: Consulta feita pelo usuário. :type text: str :param schema: Schema do índice de documentos. :type schema: Schema :returns: Query com termos e operadores. """ try: from whoosh.qparser import MultifieldParser except ImportError: print "Ocorreu um erro na importação do módulo whoosh.qparser." qp = MultifieldParser(["title", "content"], schema, None) return qp.parse(text)
def live_search(self, query):
    """Live search on the ngram field; returns stats plus up to 25 hits."""
    with self.ix.\
            searcher(weighting=scoring.BM25F(title_B=2)) as searcher:
        parser = MultifieldParser(self.live_search_field + self.search_field,
                                  schema=self.ix.schema)
        parsed = parser.parse(query)
        hits = searcher.search(parsed, limit=25).copy()
        payload = {'estimated_length': hits.estimated_length(),
                   'scored_length': hits.scored_length(),
                   'runtime': hits.runtime,
                   'list': []}
        for position, hit in enumerate(hits):
            # Only build a document URL when both routing keys are present.
            if 'id' in hit and 'space' in hit:
                url = url_for('document.view', space=hit['space'],
                              doc_id=hit['id'])
            else:
                url = None
            payload['list'].append({'id': hit.get('id', ''),
                                    'space': hit.get('space', ''),
                                    'title': hit.get('title', ''),
                                    'rank': hit.rank,
                                    'url': url,
                                    'score': hits.score(position)})
        return payload
def search(querytext, request, pagenum=1, maxresults=30, staff=False, scope=None, orderby='-creation_date'):
    """Run a paginated resource search.

    When the original query returns nothing, Whoosh's spelling correction
    is consulted and, if it changes the query, the corrected form is
    searched instead (recorded under ``corrected_q``).
    """
    search_engine = get_search_engine('resource')
    search_result = {}
    pagenum = max(pagenum, 1)
    with search_engine.searcher() as searcher:
        parser = MultifieldParser(search_engine.default_search_fields, searcher.schema)
        user_q = querytext and parser.parse(querytext) or Every()
        user_q, search_kwargs = build_search_kwargs(user_q, request, scope, staff, orderby)
        hits = searcher.search(user_q, limit=(pagenum * maxresults) + 1, **search_kwargs)
        if querytext and hits.is_empty():
            # Empty result: try a spell-corrected version of the query.
            correction_q = parser.parse(querytext)
            corrected = searcher.correct_query(correction_q, querytext)
            if corrected.query != correction_q:
                querytext = corrected.string
                search_result['corrected_q'] = querytext
                user_q, search_kwargs = build_search_kwargs(corrected.query, request, scope, staff, orderby)
                hits = searcher.search(user_q, limit=(pagenum * maxresults), **search_kwargs)
        search_engine.prepare_search_response(search_result, hits, pagenum, maxresults)
        search_result['results'] = add_other_versions(searcher, search_result['results'], request.user, staff)
        add_absolute_urls(search_result['results'], request)
    return search_result
def search(self, query): """ general search function for a query string """ hit_docs = [] index_dir = "D:/bjstinfo_index" # deprecated. we should use variable or configure file. if not os.path.exists(index_dir): print "Error: indexer doesn't exist!" sys.exit(1) ix = index.open_dir(index_dir) # For keywords query, we search multi-fields of documents as: # Title, Keywords, Abstract. give the query-time fieldsboost: # {"Title": 1.2, "Keywords": 1.1, "Abstract": 1.0} query_fields = ['Title', 'Keywords', 'Abstract'] field_boosts = {'Title':1.2, 'Keywords':1.1, 'Abstract':1.0} qp = MultifieldParser(query_fields, schema=ix.schema, fieldboosts=field_boosts) q = qp.parse(query) with ix.searcher() as s: results = s.search(q, limit=50, terms=True) # my_cf = ContextFragmenter(maxchars=100, surround=30) #custome fragmenter. # results.fragmenter = my_cf # my_score = StandarDeviationScorer(my_cf) #custome scorer. # results.scorer = my_score # results.formatter = HtmlFormatter() for hit in results: # print hit.fields() hit_docs.append(hit.fields()) # why just cannot implement the highlight function? # print hit.highlights('Abstract', top=20) return hit_docs
def search(querystring, language_code):
    """Query the per-language article index and resolve every hit to its
    Article object, returned with the hit score."""
    ix = LanguageIndex(settings.WHOOSH_INDEX_PATH, language_code, _get_schema()).load()
    parser = MultifieldParser(['title', 'keywords', 'content'], ix.schema)
    # remove unused feature for better performance
    parser.remove_plugin_class(WildcardPlugin)
    query = parser.parse(querystring)
    result = {'results': []}
    with ix.searcher() as searcher:
        for hit in searcher.search(query):
            result['results'].append({
                'score': hit.score,
                'object': Article.objects.get(code=hit.fields()['code']),
            })
    return result
def search(self):
    # Blog search action (pylons-style controller): reads the 'terms' GET
    # parameter, requires >= 4 characters, and renders highlighted hits.
    # NOTE(review): relies on module-level globals `index` and `search`
    # defined elsewhere in this file; the highlight call reads
    # `search.schema` while the parser uses `index.schema` — presumably the
    # same schema, but confirm against the module's setup code.
    c.terms = request.GET.get('terms', '')
    c.results = []
    if len(c.terms) < 4:
        # Too-short queries are rejected with a flash message and redirect.
        h.flash(
            _('Search queries must be at least 4 characters in length.'),
            'error'
        )
        redirect(url(controller='blog', action='index'))
    query = MultifieldParser(
        ['title', 'content', 'summary'],
        schema=index.schema
    ).parse(c.terms)
    results = index.searcher().search(query, limit=10)
    for result in results:
        # Only 'content' terms are used for highlighting.
        terms = [v for k, v in query.all_terms() if k == 'content']
        # The stored 'url' field holds JSON kwargs for url(); rebuild it.
        url_kwargs = json.loads(result['url'])
        result['url'] = url(**url_kwargs)
        result['highlights'] = highlight(
            result['content'],
            terms,
            search.schema['content'].format.analyzer,
            ContextFragmenter(terms),
            HtmlFormatter(tagname='span', classname='highlight')
        )
        c.results.append(result)
    return render('search.tpl', slacks=True)
def search_commodity():
    """Search commodities by the 'q' GET parameter (title/content fields),
    optionally filter by the 'c' category parameter, and render the list.

    Fix: the whoosh searcher was opened and never closed (leaked reader);
    it is now used as a context manager.
    """
    from shop import app
    ix = open_dir(app.config.get("INDEX_DIR"))
    mparser = MultifieldParser(["content", "title"], schema=ix.schema)
    query_raw = request.args.get('q', '')
    if query_raw:
        result_id = []
        with ix.searcher() as searcher:
            query = mparser.parse(unicode(query_raw.lower()))
            results = searcher.search(query)
            for result in results:
                result_id.append(int(result['id']))
        result_id = list(set(result_id))
        # Build an OR-ed peewee expression over the matched ids.
        wq = None
        for rid in result_id:
            if not wq:
                wq = Q(id=rid)
            else:
                wq |= Q(id=rid)
        if wq:
            coms = Commodity.select().where(wq)
        else:
            coms = []
    else:
        coms = Commodity.select()
    category = int(request.args.get('c', '0'))
    if category and category != 1:
        coms = [c for c in coms if c.is_category(category)]
    return render_template('core/com_list.html', commodities=coms)
def search(q, limit=None):
    """Yield hits for *q* over the bibliography index, boosting
    author/title matches most heavily."""
    ix = open_dir(DIRECTORY, NAME)
    with ix.searcher() as searcher:
        field_names = ['title', 'author', 'tags', 'notes', 'text', 'source',
                       'year']
        boosts = {'title': 7, 'year': 6, 'author': 10,
                  'tags': 4, 'notes': 2, 'text': 1}
        parser = MultifieldParser(fieldnames=field_names,
                                  fieldboosts=boosts,
                                  schema=ix.schema)
        # Whoosh chokes on queries with stop words, so remove them.
        cleaned = remove_stopwords(q)
        parsed = parser.parse(cleaned)
        for hit in searcher.search(parsed, limit=limit):
            yield hit
def __call__(self, query, limit=None, fields=None, or_=False):
    """Parse *query* over *fields* (all fields when None) and return hits;
    *or_* switches term grouping from AND to OR."""
    search_fields = self._all_fields if fields is None else fields
    grouping = OrGroup if or_ else AndGroup
    parser = MultifieldParser(search_fields, self._index.schema, group=grouping)
    parsed = parser.parse(query)
    return self._index.searcher().search(parsed, limit=limit)
def keywords(request): query = request.GET.get('q', '') if not query: return render(request, 'search/keywords.html', {'page_name': 'search.keywords'}) qtext = get_tokenized_query(query) print qtext idx_dir = os.path.join(settings.BASE_DIR, 'search/lagou_idx') ix = open_dir(idx_dir) searcher = ix.searcher() parser = MultifieldParser(["name", "com_name", 'city'], schema=ix.schema) q = parser.parse(qtext) plen = 100 results = searcher.search(q, limit=plen) total = len(results) got = results.scored_length() numterms = 100 if got < 10: numterms = 10 elif got < 100: numterms = 50 keywords = [(kw, score) for kw, score in results.key_terms("desc", docs=got, numterms=numterms)] return render(request, 'search/keywords.html', {'page_name': 'search.keywords', 'query': query, 'total': total, 'got': got, 'keywords': keywords, })
def search(self, query, *args, **kwargs):
    """Return one page of hits for *query*; extra arguments are passed
    straight through to ``search_page`` (page number, page length, ...)."""
    boosts = {'content': 1, 'title': 2, 'headings': 3, 'url': 1}
    parser = MultifieldParser(fieldnames=('content', 'title', 'headings', 'url'),
                              schema=self.ix.schema,
                              fieldboosts=boosts)
    parsed = parser.parse(query)
    searcher = self.ix.searcher()
    return searcher.search_page(parsed, *args, **kwargs)
def search(self, query_string, index, parser=None, **kwargs):
    """Resolve *index* by name, parse *query_string* over its searchable
    fields (unless a custom *parser* is given) and delegate to _search."""
    index = base._resolve_index(index)
    if parser is None:
        parser = MultifieldParser(
            fieldnames=index.get_searchable_fieldnames(),
            schema=index.get_schema())
    parsed = parser.parse(query_string)
    return self._search(parsed, index, **kwargs)
def get_whoosh_parser(index):
    """Build a MultifieldParser over content/unitid with >/< range support."""
    from whoosh.qparser import MultifieldParser, GtLtPlugin

    # TODO: only active columns
    searchable = ['content', 'unitid']
    whoosh_parser = MultifieldParser(searchable, index.schema)
    whoosh_parser.add_plugin(GtLtPlugin)
    return whoosh_parser
def search(self, term):
    """Return up to 100 hits matching *term* over body/title/tags,
    loading the index first if necessary."""
    if not self.index:
        self.load_index()
    parser = MultifieldParser(("body", "title", "tags"), schema=self.schema)
    parsed = parser.parse(term)
    # , sortedby="date", reverse=True)
    return self.searcher.search(parsed, limit=100)
def search( self, trans, search_term, page, page_size, boosts ):
    """
    Perform the search on the given search_term

    :param search_term: unicode encoded string with the search term(s)
    :param page: 1-based page number to return
    :param page_size: number of hits per page
    :param boosts: object carrying per-field boost factors

    :returns results: dictionary containing number of hits, hits themselves and matched terms for each
    :raises ObjectNotFound: when the requested page is out of range
    :raises exceptions.InternalServerError: when the index directory is missing
    """
    tool_index_dir = os.path.join( trans.app.config.whoosh_index_dir, 'tools' )
    index_exists = whoosh.index.exists_in( tool_index_dir )
    if index_exists:
        index = whoosh.index.open_dir( tool_index_dir )
        try:
            # Some literature about BM25F:
            # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
            # http://en.wikipedia.org/wiki/Okapi_BM25
            # __Basically__ the higher number the bigger weight.
            tool_weighting = scoring.BM25F( field_B={ 'name_B' : boosts.tool_name_boost,
                                                      'description_B' : boosts.tool_description_boost,
                                                      'help_B' : boosts.tool_help_boost,
                                                      'repo_owner_username_B' : boosts.tool_repo_owner_username_boost } )
            searcher = index.searcher( weighting=tool_weighting )
            parser = MultifieldParser( [ 'name', 'description', 'help', 'repo_owner_username' ], schema=tool_schema )
            # Wildcards on both sides give substring matching on the term.
            user_query = parser.parse( '*' + search_term + '*' )
            try:
                hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True )
            except ValueError:
                # search_page raises ValueError for an out-of-range page.
                raise ObjectNotFound( 'The requested page does not exist.' )
            log.debug( 'searching tools for: #' + str( search_term ) )
            log.debug( 'total hits: ' + str( len( hits ) ) )
            log.debug( 'scored hits: ' + str( hits.scored_length() ) )
            results = {}
            results[ 'total_results'] = str( len( hits ) )
            results[ 'page'] = str( page )
            results[ 'page_size'] = str( page_size )
            results[ 'hits' ] = []
            for hit in hits:
                # Serialise the stored fields we expose to the API consumer.
                hit_dict = {}
                hit_dict[ 'id' ] = hit.get( 'id' )
                hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' )
                hit_dict[ 'repo_name' ] = hit.get( 'repo_name' )
                hit_dict[ 'name' ] = hit.get( 'name' )
                hit_dict[ 'description' ] = hit.get( 'description' )
                results[ 'hits' ].append( {'tool': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } )
            return results
        finally:
            # Always release the reader, even when a page lookup fails.
            searcher.close()
    else:
        raise exceptions.InternalServerError( 'The search index file is missing.' )
def search(ix, query_string, sortedby=None, limit=10):
    """OR together the split keywords of *query_string* and return up to
    *limit* hits from *ix*, optionally sorted."""
    parser = MultifieldParser(["title", "summary"], schema=ix.schema)
    searcher = ix.searcher()
    keywords = split_keywords(query_string)
    user_q = parser.parse(' OR '.join(keywords))
    # TODO: add query filter
    return searcher.search(user_q, sortedby=sortedby, limit=limit)
def search(self, search_key):
    """Parse *search_key* over book/chapter/verse/verse_text and return the
    formatted search results (up to 1000 hits).

    Fix: the searcher was opened and never closed (leaked index reader);
    it is now a context manager, with formatting done while it is open.
    """
    ix = self.getIndex()
    parser = MultifieldParser(["book", "chapter", "verse", "verse_text"],
                              schema=ix.schema)
    query = parser.parse(search_key)
    with ix.searcher() as searcher:
        result = searcher.search(query, limit=1000)
        return self.formatSearchResult(result)
def search( self, query, return_attribute='id' ):
    """Search title/description/help, weighting title matches highest, and
    return the chosen stored attribute of every hit."""
    # Change field boosts for searcher to place more weight on title, description than help.
    searcher = self.index.searcher(
        weighting=BM25F( field_B={ 'title_B' : 3, 'description_B' : 2, 'help_B' : 1 } )
    )
    # Set query to search title, description, and help.
    parser = MultifieldParser( [ 'title', 'description', 'help' ], schema = schema )
    hits = searcher.search( parser.parse( query ) )
    return [ hit[ return_attribute ] for hit in hits ]
def init():
    # Build (if missing) the e-mail and address-book indexes, then run an
    # interactive loop searching the e-mail index by subject/body.
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    indexes = {}
    for ixname, schema in schemas.items():
        # Translation of the note below: this part could be improved — it
        # only indexes when no index exists and ignores modified/deleted
        # files (see the linked incremental-indexing documentation).
        """ Esta parte es mejorable, ya que sólo indexa si no existe indice. No tiene en cuenta si los archivos indexados se han modificado o si se han eliminado como se explica aquí: @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)
            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix
    # Main routine: prompt ("enter a word from the subject or body"), search
    # the e-mail index, and print sender ("Remitente") / subject ("Asunto").
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): "))
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(unicode(input_user))
            results = searcher.search(myquery)
            print "=================================================="
            for result in results:
                # read_file(result.get("path"))
                print ("Remitente: " + findNameBySender(indexes, result.get("sender_email")))
                print ("Asunto: " + result.get("subject"))
                print "=================================================="
def answer_query(query):
    """Fuzzy-match *query* against title/summary and return the tags of the
    top 100 hits, title matches boosted heavily."""
    with main_index.searcher() as searcher:
        parser = MultifieldParser(['title', 'summary'], main_index.schema,
                                  fieldboosts={'title': 5.0, 'summary': 0.2})
        parser.add_plugin(FuzzyTermPlugin())
        # tilde adds fuzzy parsing for 1 character and /1 requires the first letter to match
        parsed = parser.parse(unicode(query) + '~/1')
        hits = searcher.search(parsed, limit=100)
        return [hit['tag'] for hit in hits]
def search_my_archive(query_str):
    """Search the tweet-archive index by content/retweet and return the
    matching feed dicts from the archive JSON file."""
    my_index = open_dir(conf.PATH_INDEX_ARCHIVE)
    with my_index.searcher() as searcher:
        parser = MultifieldParser(['content', 'retweet'], schema=my_index.schema)
        hits = searcher.search(parser.parse(query_str))
        matched_ids = [entry['feed_id'] for entry in hits]
    with open(conf.PATH_ARCHIVE_JSON, 'r') as f:
        feeds = json.loads(f.read())
    return [feed for feed in feeds if str(feed['id']) in matched_ids]
def term_search(self, query):
    """Yield stored fields for entries matching *query* (a multidict-style
    mapping of field -> values); the mapping is consumed as clauses are
    built.  A 'term' entry is parsed over the free-text term fields, every
    other key becomes an OR of exact Term matches."""
    clauses = []
    if query.get('term'):
        parser = MultifieldParser(self.term_fields, schema=self.index.schema)
        clauses.append(parser.parse(unicode(query.pop('term')[0])))
    for key in query.keys():
        clauses.append(Or([Term(key, unicode(value))
                           for value in query.pop(key)]))
    with self.searcher() as searcher:
        for entry in searcher.search(And(clauses), limit=None):
            yield entry.fields()
def lookup(self, term, fuzzy=False, limit=None):
    """Look *term* up across every chip field.

    When an exact search returns nothing, retries once with fuzzy term
    matching and the fuzzy result limit.  Custom single-character query
    operators (&, |, &!, &~, -) replace whoosh's word operators.
    """
    term = term.strip().lower()
    limit = limit if limit else self.RESULTS_LIMIT
    fields = (
        'indice', 'indice_game', 'name', 'name_jp', 'game', 'version',
        'classification', 'element', 'code', 'size', 'damage_min',
        'damage_max', 'recovery', 'rarity'
    )
    if fuzzy:
        parser = MultifieldParser(fields, schema=self.index.schema,
                                  termclass=FuzzyTerm)
    else:
        parser = MultifieldParser(fields, schema=self.index.schema)
    operators = OperatorsPlugin(And="&", Or="\\|", AndNot="&!",
                                AndMaybe="&~", Not="\\-")
    parser.replace_plugin(operators)
    query = parser.parse(term)
    results = []
    try:
        searcher = self.index.searcher()
        results = searcher.search(query, limit=limit)
        if not results and not fuzzy:
            # Try a Fuzzy Search.
            return self.lookup(term, fuzzy=True, limit=self.FUZZY_LIMIT)
    except IndexError:
        pass
    return results
def render_GET(self, request):
    # Twisted resource (generator-style with defer.returnValue): either
    # describes the section's schema ('schema' arg) or runs a query ('q'
    # arg) against the section's whoosh index and returns a serialised
    # folder of results.
    section_path = '/'.join(request.postpath).strip('/')
    if not section_path:
        defer.returnValue(json.dumps({'status': 'error', 'message': 'unable to search root'}))
    section_name = request.postpath[0]
    ix = self._get_index(section_path)
    if not ix:
        defer.returnValue(json.dumps({'status': 'error', 'message': 'unknown index for %s' % section_path}))
    schema_settings = self._get_schema_settings(section_path)
    schema = schema_settings['schema']
    if 'schema' in request.args:
        # Wait for any in-flight indexing of this section to finish first.
        if section_path in self.currently_indexing:
            yield self.currently_indexing[section_path]
        field_choices = schema_settings.get('field_choices', {})
        fields = {}
        for field in set(schema.names()):
            # Only KEYWORD fields with configured choices are exposed.
            if isinstance(schema[field], KEYWORD) and field in field_choices:
                fields[field] = sorted(x for x in field_choices[field] if x)
        defer.returnValue(json.dumps({'status': 'ok', 'schema': fields, 'type': schema_settings['type']}))
    if 'q' not in request.args:
        defer.returnValue(json.dumps({'status': 'error', 'message': 'missing q argument in url'}))
    q = unicode(request.args['q'][0])
    parser = MultifieldParser(['search_field'], schema=schema)
    parser.add_plugin(GtLtPlugin())
    query = parser.parse(q)
    with ix.searcher() as searcher:
        # Run the (potentially slow) search off the reactor thread.
        results = yield threads.deferToThread(searcher.search, query, limit=10000)
    #corrected = searcher.correct_query(query, q) # jesus this is bad for titles
    results = [x['linkitem'] for x in results]
    section = settings.SECTIONS[section_name]
    rootfolder = RootFolder(parent_path='', name='Search result for: %s' % q, urlname=self.name, date=0)
    rootfolder['content_type'] = section.levels[0].content_type
    for result in results:
        rootfolder.add_item(result)
    #if corrected.query != query:
    #    retval['suggestion'] = {
    #        'rel': 'suggested_query',
    #        'href': urlparse.urljoin(settings.BASE_URL, '/search/%s' % urllib.quote(section_path)) + '?%s' % urllib.urlencode({'q': corrected.string}),
    #        'suggested_query': corrected.string,
    #    }
    defer.returnValue(rootfolder.serialize())
def _create_parser(self, context):
    """Build the boosted multifield parser, extended with the configured
    meta-keyword plugins for *context*."""
    boosted_fields = self.field_boosts.keys()
    parser = MultifieldParser(boosted_fields,
                              WhooshBackend.SCHEMA,
                              fieldboosts=self.field_boosts)
    meta_plugin = MetaKeywordPlugin(
        meta_keyword_parsers=self.meta_keyword_parsers, context=context)
    parser.add_plugin(meta_plugin)
    return parser
def searchterm():
    """Search publications by the posted term and render the results page,
    mapping each hit's path to its display fields and headline image."""
    search_term = request.form['search_term']
    ix = open_dir("indexdir")
    result_dict = {}
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "author", "secondauthor", "publication"], ix.schema)
        results = searcher.search(parser.parse(search_term), limit=20)
        for result in results:
            headline_jpeg = "imgs/publications/pub" + result['publication'] + "/" + result['path'].split("/")[-1] + ".jpg"
            result_dict[result['path']] = [result['title'], result['author'], result['publication'], headline_jpeg]
    return render_template('form_action.html', search_terms=result_dict, orig_search=search_term)
def preform_whoosh_search(query, ix=None, fields=None, page=None, per_page=None,
                          sortedby=None, reverse=True, **kwargs):
    """
    Query the index, looking for a match in the specified fields.
    Results a tuple of results and an open searcher object.

    Fix: ``sortedby`` defaulted to a shared mutable list (``[]``); it now
    defaults to ``None`` and is normalised below (behaviour unchanged for
    callers).  Also renamed the local that held the *parsed query* — it
    was misleadingly called ``parser``.
    """
    per_page = per_page or settings.SEARCH_RESULTS_PER_PAGE
    fields = fields or [
        'tags', 'title', 'author', 'author_uid', 'content', 'author_handle'
    ]
    if sortedby is None:
        sortedby = []
    ix = ix or init_index()
    searcher = ix.searcher()
    # Splits the query into words and applies
    # and OR filter, eg. 'foo bar' == 'foo OR bar'
    parsed_query = MultifieldParser(fieldnames=fields,
                                    schema=ix.schema,
                                    group=OrGroup).parse(query)
    if page:
        # Return a pagenated version of the results.
        results = searcher.search_page(parsed_query,
                                       pagenum=page,
                                       pagelen=per_page,
                                       sortedby=sortedby,
                                       reverse=reverse,
                                       terms=True)
        # Show more context before and after
        results.results.fragmenter.maxchars = 100
        results.results.fragmenter.surround = 100
    else:
        results = searcher.search(parsed_query,
                                  limit=settings.SEARCH_LIMIT,
                                  sortedby=sortedby,
                                  reverse=reverse,
                                  terms=True)
        # Allow larger fragments
        results.fragmenter.maxchars = 100
        results.fragmenter.surround = 100
    #logger.info("Preformed index search")
    return results
def search():
    """Flask endpoint: filter posts by free text, author and/or category and
    return one page (25 hits) as JSON, newest first."""
    print(request.args)
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    page = 1 if request.args.get('page') is None else int(request.args.get('page'))
    print(search)
    if search is None and author is None and category is None:
        myquery = Every()
    elif search is None:
        # Filter-only query built from exact Term clauses.
        if author is not None:
            myquery = Term('author', author)
            if category is not None:
                myquery = myquery & Term('category', category)
        else:
            myquery = Term('category', category)
    else:
        parser = MultifieldParser(["title", "post_content"], ix.schema,
                                  plugins=[FuzzyTermPlugin()])
        myquery = parser.parse(search)
        if author is not None:
            myquery = myquery & Term('author', author)
        if category is not None:
            myquery = myquery & Term('category', category)
    with ix.searcher() as searcher:
        results = searcher.search_page(myquery, page, pagelen=25,
                                       sortedby="date", reverse=True)
        print(results.is_last_page())
        results_json = json.dumps(
            {
                "results": [dict(i) for i in results],
                "page": page,
                "total_results": results.total
            },
            default=str)
    resp = Response(response=results_json, status=200,
                    mimetype="application/json")
    return resp
def search_whoosh_index(query, offset=0, limit=10, *args, **kwargs):
    # Search the shared whoosh index for *query*, filtered by exact Term
    # matches built from **kwargs, and return a slice [offset:offset+limit]
    # of hits plus the total count.
    ix = get_whoosh_index()
    parser = MultifieldParser(['content', 'authors', 'tags', 'title', 'abstract'], ix.schema)
    # user query
    q = parser.parse(query)
    if not query:
        # Empty query matches everything.
        q = Every()
        print 'arch'
    # NOTE(review): with no kwargs this builds And([]) — verify whether an
    # empty And is intended to act as a no-op filter here.
    allow_q = And([Term(key, value) for key, value in kwargs.iteritems()])
    # parse remaining args
    res = []
    count = 0
    offset = int(offset)
    limit = int(limit)
    right = offset + limit
    # restrict_q = Or([Term("path", u'%s' % d.id) for d in qs])
    #print 'query', q, allow_q, kwargs
    with ix.searcher() as searcher:
        # From WHOOSH documentation:
        # > Currently, searching for page 100 with pagelen of 10 takes the same amount of time as using Searcher.search()
        # to find the first 1000 results
        results = searcher.search(q, filter=allow_q, limit=right, terms=True)
        count = len(results)
        for hit in list(results)[offset:]:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': hit.highlights("content", top=5)
            })
            # @todo filter by empty highlight strings
    return {
        'results': res,
        'count': count
    }
def search(page):
    """Search the blog index and render matching Ghost posts and static pages.

    :param page: 1-based result-page number.

    Fixes two shadowing bugs: the ``page`` argument (the page number) was
    clobbered by the static-page config dict inside the loop, and the loop
    variable ``p`` was reused as a file handle.
    """
    search = request.args['q']
    storage = FileStorage(conf.INDEX_DIR)
    index = storage.open_index(indexname=conf.INDEX_NAME)
    qp = MultifieldParser(['title', 'text', 'tags'], schema=index.schema)
    q = qp.parse(search)
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=conf.PAGE_SIZE)
        # Get real posts
        # NOTE(review): post ids are interpolated into the SQL string; they
        # come from our own index, but a parameterised query would be safer.
        post_ids = ",".join(["'%s'" % p['post_id'] for p in results
                             if not p['post_id'].startswith('static-')])
        if post_ids:
            ghost = get_voyage_connection()
            with ghost.cursor() as ghost_cur:
                query = "SELECT title, feature_image, html, slug FROM posts WHERE id IN (%s) ORDER BY published_at DESC" % post_ids
                ghost_cur.execute(query)
                posts = [{'type': "post", 'title': i[0], 'image': i[1],
                          'excerpt': excerpt(i[2]), 'url': "/blog/" + i[3]}
                         for i in ghost_cur.fetchall()]
            ghost.close()
        else:
            posts = []
        # Get static pages
        for hit in results:
            if hit['post_id'].startswith('static-'):
                page_key = hit['post_id'].replace("static-", "")
                static_page = conf.STATIC_TPL[page_key]
                with open('templates/' + static_page['tpl_file'], "r") as fh:
                    page_text = fh.read()
                posts.append({'type': 'static-page',
                              'title': static_page['title'],
                              'excerpt': excerpt(page_text),
                              'image': static_page.get('image'),
                              'url': static_page['url']})
    return render_template("search.html", posts=posts, search=search)
def search(self, query, page=1, pagelen=20): """Return a sorted list of results. pagelen specifies the number of hits per page. page specifies the page of results to return (first page is 1) Set pagelen = None or 0 to retrieve all results. """ query = unicode(query) # Must be unicode population_sort_facet = sorting.FieldFacet("population", reverse=True) ix = whoosh_open_dir_32_or_64(self.index_dir) with ix.searcher() as searcher: # query = QueryParser("ngram_name", ix.schema).parse(query) mparser = MultifieldParser( ["ngram_name", "admin1_code", "country_code"], schema=ix.schema) query = mparser.parse(query) if pagelen is not None and pagelen != 0: try: results = searcher.search_page(query, page, pagelen=pagelen) except ValueError, e: # Invalid page number results = [] else:
def perform_search(query="*", page=1):
    """Run a paged search over lemma/Description/Variation.

    v/V and j/J are normalised to u and i/I (Latin orthography) before
    parsing.  An empty or '*' query falls through to the unfiltered
    listing via ``full()``.
    """
    if query.strip() in {"", "*"}:
        return full(page=page)
    query = query.replace("v", "u").replace("V", "u").replace("j", "i").replace("J", "I")
    parser = MultifieldParser(["lemma", "Description", "Variation"],
                              schema=search_index.schema)
    parsed = parser.parse(query)
    with search_index.searcher() as searcher:
        page_results = searcher.search_page(parsed, pagenum=page, pagelen=PAGELEN)
        return {
            "pages": {
                "last": page_results.pagecount,
                "current": page_results.pagenum,
                "is_last": page_results.is_last_page()
            },
            "results": [dict(hit) for hit in page_results]
        }
def search_index(query_string, page):
    """
    Search index based on the query

    :param query_string: query
    :param page: requested page
    :return: tuple: ids of videos on page, results total, page count
    """
    # NOTE: the return order is (ids, total, pagecount) — the old docstring
    # described it incorrectly.
    index = open_index()
    with index.searcher() as searcher:
        query = MultifieldParser(settings.INDEX_SEARCH_FIELDS, index.schema, group=OrGroup).parse(query_string)
        results = searcher.search_page(query, pagenum=page)
        return [hit['id'] for hit in results], results.total, results.pagecount
def search(self, query):
    """
    Query the shelf.  *query* maps field names to values; a '*' value in
    any field means "match every document".
    """
    qp = MultifieldParser(query.keys(), schema=self.index.schema)
    # Let's assemble the query
    search_terms = ''
    for field in query:
        search_terms += '%s:%s ' % (field, query[field])
    # We need to parse the query
    if u'*' in [query[field] for field in query]:
        # We have a query asking for every doc
        q = Every()
    else:
        q = qp.parse(search_terms)
    # And now, we search
    return self.index.searcher().search(q, limit=None)
def searchPost(keyword, ForumId=None): """ Search posts with keyword [IN]: keyword [OUT]: list of post IDs """ # q = Or([Term("Title", unicode(keyword)), Term("Body", unicode(keyword))]) parser = MultifieldParser(["Title", "Body", "Tags"], schema=Post_Schema) s_parser = parser.parse("Title OR beta gamma") words="" for k in keyword: words += k+" and " q = parser.parse(words[0:-5]) allow_q = Term("ForumId", str(ForumId)) if ForumId else None print q ix = open_dir(INDEX_DIR, indexname=POST_INDEX) results = [] with ix.searcher() as searcher: hits = searcher.search(q, filter=allow_q, limit=None) for hit in hits: results.append((hit["ForumId"], hit["Id"])) return results
def search(indexer, searchTerm, searchColumns):
    """Search *indexer* over *searchColumns* and return four parallel lists:
    access names, counties, types and locations of every hit."""
    with indexer.searcher() as searcher:
        parsed = MultifieldParser(searchColumns, schema=indexer.schema).parse(searchTerm)
        hits = searcher.search(parsed, limit=None)
        print("Length of results: " + str(len(hits)))
        result = [[], [], [], []]
        for hit in hits:
            result[0].append(hit['Access_Name'])
            result[1].append(hit['County'])
            result[2].append(hit['Type'])
            result[3].append(hit['Location'])
        return result
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every documents.
    """
    if not keywords:
        # No keywords at all: match everything.
        return And([Every()])
    composer = current_app.config['KERKO_COMPOSER']
    # Localised boolean operators; phrase, grouping and boost syntax enabled.
    text_plugins = [
        plugins.PhrasePlugin(),
        plugins.GroupPlugin(),
        plugins.OperatorsPlugin(
            And=r"(?<=\s)" + re.escape(gettext("AND")) + r"(?=\s)",
            Or=r"(?<=\s)" + re.escape(gettext("OR")) + r"(?=\s)",
            Not=r"(^|(?<=(\s|[()])))" + re.escape(gettext("NOT")) + r"(?=\s)",
            AndNot=None,
            AndMaybe=None,
            Require=None),
        plugins.BoostPlugin(),
    ]
    queries = []
    for key, value in keywords.items(multi=True):
        scoped_fields = [
            spec.key for spec in composer.fields.values() if key in spec.scopes
        ]
        if not scoped_fields:
            raise KeyError  # No known field for that scope key.
        parser = MultifieldParser(
            scoped_fields, schema=composer.schema, plugins=text_plugins)
        queries.append(parser.parse(value))
    return And(queries)
def post(self):
    """Handle a song search: query the artist/lyrics/title fields and
    render the results page, sorted by comment count (descending)."""
    song_detail = self.get_argument('query')
    ix = index.open_dir(indexdir_path)
    facet = sorting.FieldFacet("comment_num", reverse=True)
    qp = MultifieldParser(["artist_name", 'lyrics', 'music_name'],
                          schema=ix.schema)
    q = qp.parse(song_detail)
    # FIX: the searcher was never closed (resource leak). Use it as a
    # context manager; results must be consumed before it closes, so
    # rendering happens inside the `with` block.
    with ix.searcher() as searcher:
        results = searcher.search(q, sortedby=facet)
        for r in results:
            print(r.fields())
        song_num = len(results)
        if song_num == 1:
            song_num = '0'
        self.render('section4.html', results=results,
                    song_detail=song_detail, song_num=song_num)
def searchWhoosh(request):
    """Search the product index with the request's query and return the top
    `query_limit` hits as a JSON array of product field lists."""
    ix = open_dir("whooshdir")
    parser = MultifieldParser(['brand', 'name', 'category', 'price'],
                              schema=ix.schema, group=OrGroup)
    parsed = parser.parse(request.GET.get('query'))
    results_json = []
    with ix.searcher() as searcher:
        # Top-N search where N comes straight from the request.
        hits = searcher.search(parsed,
                               limit=int(request.GET.get('query_limit')))
        print("{} products".format(len(hits)))
        for hit in hits:
            results_json.append([
                hit['sku'],
                hit['image'],
                hit['brand'],
                hit['name'],
                hit['category'],
                hit['price']
            ])
    print('--------------END SEARCH--------------')
    print(results_json)
    return HttpResponse(json.dumps(results_json), 'application/json')
def search(qstring, index_folder="index_fullname"):
    """
    Search the full-name index for `qstring`.

    :param qstring: raw query string typed by the user
    :param index_folder: sub-directory (under `dirname`) holding the index
    :return: dict with
        "ids": list of {"rank": position, "id": document id} for the top
               20 hits, best first
        "suggestions": spelling suggestions for `qstring` (only populated
               when the search produced no hits)
    """
    index = whoosh.index.open_dir(os.path.join(dirname, index_folder))
    schema = index.schema
    qp = MultifieldParser(["fullname"], schema=schema)
    q = qp.parse(unicode(qstring))
    with index.searcher() as searcher:
        searchResult = searcher.search(q, limit=20)
        ids = [{"rank": index_rank, "id": r["id"]}
               for index_rank, r in enumerate(searchResult)]
        suggestions = []
        if not ids:
            # Only fall back to spelling suggestions when nothing matched;
            # the corrector is not needed (or built) otherwise.
            corrector = searcher.corrector("fullname")
            suggestions = corrector.suggest(qstring, limit=6)
        # `ids` is already a list; no need to re-wrap it.
        return {"ids": ids, "suggestions": suggestions}
def simple_query(query, or_group=False, page=0, n=10):
    """
    Perform a simple keyword query through whoosh over the three major
    text-based fields (name, rules text, flavor text).

    :param str query: the input query
    :param bool or_group: use OR grouping instead of the default AND grouping
    :param int page: the (zero-based) page of results to return
    :param int n: how many results should be in the returned page
    :return: list of the page's `data_obj` values
    """
    ix = get_whoosh_index()
    # `page` and `n` may arrive as string versions of ints; normalize.
    page, n = int(page), int(n)
    grouping = OrGroup if or_group else AndGroup
    parser = MultifieldParser(['rules_text', 'name', 'flavor_text'],
                              ix.schema, group=grouping)
    # All text fields are lowercased in whoosh, so lower the query too.
    parsed = parser.parse(query.lower())
    with ix.searcher() as searcher:
        # whoosh expects pages to start at 1, hence page + 1.
        hits = searcher.search_page(parsed, page + 1, pagelen=n)
        return [hit['data_obj'] for hit in hits]
def search_clicked():
    """Run the search typed in `txt`, list up to 30 result URLs as buttons,
    and offer a spelling correction when the query looks misspelled."""
    fc_btn.grid_remove()
    global current_page
    if displayed_results:
        page_clear(True)
    parser = MultifieldParser(['title', 'body'], schema=ix.schema)
    keyword = preProcess(txt.get().lower())
    parsed = parser.parse(keyword)
    with ix.searcher(weighting=wBM25) as searcher:
        for hit in searcher.search(parsed, limit=30):
            url = str(hit['url'])
            num_label = Label(window, text='0.')
            url_button = Button(window, text=hit['url'][30:],
                                command=callback(url))
            displayed_results.append((num_label, url_button))
    current_page = 0
    select_page()
    if isMispelled(txt.get()):
        fc_btn.grid(columnspan=10, row=2, sticky='ew')
        fc_btn.configure(text='Forse cercavi: ' + correct(txt.get()))
def search_frequency(**kwargs):
    """
    Search the basic index with term-frequency scoring.

    Expects kwargs['query']: a whitespace-separated string of words. Every
    word must appear in either the 'question' or the 'answer' field
    (an AND of per-word ORs). Yields the 'id' of each hit.
    """
    ix = open_dir(INDEX_PATH_BASIC)
    with ix.searcher(weighting=Frequency) as searcher:
        # Build And([Or(question:w, answer:w) for each word w]).
        # (The MultifieldParser the original constructed was never used.)
        terms = [Or([Term('question', word), Term('answer', word)])
                 for word in kwargs['query'].split(' ')]
        myquery = And(terms)
        results = searcher.search(myquery, limit=None)
        for result in results:
            yield result['id']
def search_in_index(search_kw, index):
    '''
    search_kw: what the user typed into the search bar
    index: the opened index (the `ix` object from the preceding code)

    Returns a dict with two keys:
    - results: a list of dicts, one per search result, best result first.
      Each dict has two keys: 'title' (the document title) and 'path'
      (the path to the text document, for now).
    - suggestions: a dict proposing a possible correction for each word the
      user typed. How the per-word suggestions get combined into complete
      suggestions is still to be decided.
    '''
    # Use a MultifieldParser so we search both the title and the content.
    parser = MultifieldParser(["content", "title"], index.schema)
    # Add a fuzzy-matching plugin so we can match beyond exact words.
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # Rewrite the user query into the fuzzy plugin's syntax (word~1).
    fuzzy_kw = ' '.join(word + '~1' for word in search_kw.split(' '))
    parsed = parser.parse(fuzzy_kw)
    # Collect the hits now so the searcher can be closed afterwards.
    hits = searcher.search(parsed)
    results = [{'title': hit['title'], 'path': hit['path']} for hit in hits]
    # Set up the corrector and record its proposals for every typed word.
    corrector = searcher.corrector("content")
    suggestions = {word: corrector.suggest(word)
                   for word in search_kw.split(' ')}
    searcher.close()
    return {'results': results, 'suggestions': suggestions}
def searchh(indexer, searchTerm):
    """Search the 'Access_Name' field for `searchTerm` and return every
    stored field of the best hit as a flat list ([] when nothing matched)."""
    with indexer.searcher() as searcher:
        parsed = MultifieldParser(['Access_Name'],
                                  schema=indexer.schema).parse(searchTerm)
        hits = searcher.search(parsed)
        print("\nLength of results: " + str(len(hits)) + '\n')
        # Every stored field, in the order the caller expects them back.
        field_order = [
            'Access_Name', 'URL', 'imgURL', 'County', 'Type', 'Location',
            'Access_Type', 'Path_to_Beach', 'Managed_by', 'Parking', 'Fee',
            'Bathrooms', 'Handicap_Access', 'Running_Water', 'Showers',
            'Camp_Sites', 'Stairs_to_Beach', 'Boat_Ramps', 'Tidepooling',
            'Surfing', 'Hiking', 'Bicycling', 'Horseback_Riding',
            'Road_Vehicle_Access', 'Whale_Watching',
        ]
        if not hits:
            return []
        return [hits[0][field] for field in field_order]
def search(self, words):
    """Search the index for documents containing every word in `words`
    (joined with 'and' across all configured fields) and print the first
    page (3 hits per page) with their BM25 scores."""
    ix = open_dir(self.indexdir)
    searcher = ix.searcher()
    # Equivalent to And[Or[Term(f, w) for f in fieldNames] for w in words].
    parser = MultifieldParser(self.fieldNames, schema=self.schema)
    parsed = parser.parse(" and ".join(words))
    # Page 1, 3 hits per page.
    results = searcher.search_page(parsed, 1, pagelen=3)
    for hit in results:
        # hit.fields() is a dict like {'title': ..., 'content': ...}.
        fields = hit.fields()
        print(fields.values())
        print("bm25分数: %f" % hit.score)
    print("总共记录数: %i" % len(results))
def classifyResults(request):
    """
    Classify an input resume and render the jobs matching the predicted
    indexes.

    Reads 'resumeInput' from the GET params, runs the classifier
    (resumeSearch) to get matching job ids, ORs them into a single whoosh
    query, and renders the hits along with the elapsed time.
    """
    timeS = time.time()
    request.session['searchQuery'] = ""
    resume = request.GET.get('resumeInput', None)
    indexes = resumeSearch(resume)
    ix = open_dir(settings.WHOOSH_INDEX)
    parser = MultifieldParser([
        "jobtitle", "company", "city", "state", "country", "source", "date",
        "JD", "url", "latitude", "longitude", "relative_time"
    ], ix.schema, group=qparser.OrGroup)
    # FIX: join with " OR " instead of appending a trailing " OR ", which
    # left a dangling operator at the end of the parsed query string.
    queryInput = " OR ".join("job_id:" + i for i in indexes)
    query = parser.parse(queryInput)
    # NOTE(review): the searcher is left open on purpose — the template
    # iterates `results` lazily after this function returns.
    searcher = ix.searcher()
    results = searcher.search(query)
    numResults = len(results)
    timeLapse = float("{0:.3f}".format(time.time() - timeS))
    return render(request, 'classifyResults.html', {
        "results": results,
        'time': timeLapse,
        'num': numResults
    })
def search_video(keyword, page, pagelen):
    """
    Paged search of the video index on title / pinyin_title.

    :param keyword: query string
    :param page: 1-based page number (string or int)
    :param pagelen: hits per page (string or int)
    :return: (constants.CODE_OK, list of video-info dicts, total hit count)
    """
    with ix_video.searcher() as searcher:
        parser = MultifieldParser(['title', 'pinyin_title'],
                                  schema=ix_video.schema)
        q = parser.parse(keyword)
        results = searcher.search_page(q, pagenum=int(page),
                                       pagelen=int(pagelen))
        video_list = []
        for hit in results:
            field = hit.fields()
            # 'gcid' defaults to '' ; the other keys may be None when the
            # stored document lacks them.
            # (FIX: the original assigned 'duration' twice.)
            info = {
                'gcid': field.get('gcid', ''),
                'movieid': field.get('movieid'),
                'uid': field.get('uid'),
                'cover_height': field.get('cover_height'),
                'cover_width': field.get('cover_width'),
                'cover_url': field.get('cover_url'),
                'duration': field.get('duration'),
                'title': field.get('title'),
            }
            video_list.append(info)
        return constants.CODE_OK, video_list, results.total
def query(q='', fields=None, **kwargs):
    """
    Query the index, looking for a match in the specified fields.

    :param q: the query string
    :param fields: list of field names to search (defaults to ['content'])
    :param kwargs: extra keyword args forwarded to searcher.search()
    :return: whoosh results, or [] when the index does not exist
    """
    # FIX: avoid a mutable default argument; None means "use the default".
    fields = ['content'] if fields is None else fields
    # Do not perform any queries if the index does not exist.
    if not index_exists():
        return []
    ix = init_index()
    searcher = ix.searcher()
    post_type = FieldFacet("type")
    content_length = FieldFacet("content_length", reverse=True)
    default = ScoreFacet()
    # Splits the query into words and applies an OR filter,
    # e.g. 'foo bar' == 'foo OR bar'.
    orgroup = OrGroup
    # Sort by: post type, match score, then content length.
    sort_by = [post_type, default, content_length]
    parser = MultifieldParser(fieldnames=fields, schema=ix.schema,
                              group=orgroup).parse(q)
    results = searcher.search(parser, sortedby=sort_by,
                              limit=settings.SEARCH_LIMIT, terms=True,
                              **kwargs)
    # Allow larger fragments, with more context before and after.
    results.fragmenter.maxchars = 100
    results.fragmenter.surround = 100
    return results
def search(self, text, table, limit=10):
    """
    Search the index for anything containing `text`.

    TF-IDF weighting is used because that is what our mongo search will use.
    """
    schema = self.__get_schema()
    index = self.__get_index(schema, False)
    # Search every field except prop6.
    searchable = schema.names()
    searchable.remove('prop6')
    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        parsed = MultifieldParser(searchable, schema=index.schema).parse(text)
        hits = searcher.search(parsed, limit=None)
        return self.__build_results(hits, table, limit)
class WhooshWrap():
    '''
    Wrapper class to make the Whoosh API a little simpler.

    Initialize by pointing to an existing Whoosh index and specifying the
    searchable fields, max results and timeout. Query by running
    self.doSearch with a query string. Results of the last search are
    stored on the object as a Whoosh results object (requires an open
    index to access) and returned as a plain python dictionary.
    '''

    def __init__(self, MSID_index_dir, Searchable, MaxResults=10, Timeout=0.5):
        '''
        Initializes the wrapper object with index reference and preferences

        parameter MSID_index_dir = (string) Existing Whoosh Index directory
        parameter Searchable = (list of strings) Fieldnames of the index to search
        parameter MaxResults = (numeric) Maximum # of results to return
        parameter Timeout = (numeric) Maximum # of seconds to wait before ending search
        '''
        self.ix = index.open_dir(MSID_index_dir)
        # BUG FIX: this assignment was commented out, but doSearch() reads
        # self.qp — without it every search raised AttributeError.
        self.qp = MultifieldParser(Searchable, schema=self.ix.schema)  # Search all the specified fields
        self.s = self.ix.searcher(weighting=scoring.BM25F)  # Fancy Scorer
        # The "collector" allows setting the timeout for a search.
        c = self.s.collector(limit=MaxResults)
        self.c = TimeLimitCollector(c, Timeout)
        self.Searchable = Searchable
        self.LastResults = None

    def doSearch(self, qstring, ReturnFields):
        '''
        Performs a search on the index with the provided query and returns a
        dict of results.

        parameter qstring = (string) Search key
        parameter ReturnFields = (list of strings) Fieldnames to include in the
            returned results. NOTE: may differ from Searchable, but the fields
            must exist in the index.
        returnval ResultsDict = {'field1': [result strings], 'field2': [...], ...}
        '''
        q = self.qp.parse(qstring)  # build query with event-provided search key
        try:
            self.s.search_with_collector(q, self.c)
        except Exception:  # narrowed from a bare except; typically whoosh's TimeLimit
            print("TIMEOUT!")  # DEBUG output to console if we're timing out a lot
        # Even after a timeout, return whatever we've got (partial results).
        results = self.c.results()
        self.LastResults = results
        ResultsDict = {}
        for field in ReturnFields:
            ResultsDict[field] = []
            for res in results:
                ResultsDict[field].append(res[field])  # should check that field is in results
        return ResultsDict
def mostrar_lista(event):
    """Fill the listbox with the news items whose title or description
    matches the text typed into `en_tituloDescripcion`."""
    lb.delete(0, END)
    ix = open_dir(dir_index)
    with ix.searcher() as searcher:
        parsed = MultifieldParser(
            ["titulo", "descripcion"],
            ix.schema).parse(str(en_tituloDescripcion.get()))
        for hit in searcher.search(parsed):
            lb.insert(END, "Antetitulo: " + hit['antetitulo'])
            lb.insert(END, "Titulo: " + hit['titulo'])
            lb.insert(END, "Fecha publicacion: " +
                      hit['fechaPublicacion'].strftime('%Y/%m/%d'))
            lb.insert(END, "")
def buscar_tarifas_movil(str=None):
    """
    Prompt the user for a search word and print the matching mobile
    tariffs from the 'indice_tarifasMovil' index.

    NOTE(review): the parameter shadows the builtin `str` and is never
    used — the query is actually read from stdin via input(). It now
    defaults to None so the function can be called without an argument;
    the name is kept for backward compatibility with existing callers.
    """
    query = input("Introduzca una palabra de busqueda: ")
    ix = open_dir(dir_in, indexname="indice_tarifasMovil")
    with ix.searcher() as searcher:
        myquery = MultifieldParser([
            "internet_movil", "coste_mensual", "minutos", "promociones",
            "tipo"
        ], ix.schema).parse(query)
        results = searcher.search(myquery)
        for r in results:
            print("Nombre: " + r['nombre'])
            print("Minutos: " + r['minutos'])
            print("Internet Movil: " + r['internet_movil'])
            print("Promociones: " + r['promociones'])
            print("Coste Mensual: " + r['coste_mensual'])
            print("Tipo: " + r['tipo'])
            print("")
def search(self, ix=None):
    """
    Build self.query (and self.searcher) from the object's current word
    settings.

    :param ix: an already-open whoosh index; opened from self.dir_name
        when not supplied.
    """
    if ix is None:
        ix = open_dir(self.dir_name)
        # BUG FIX: the original recursively called self.search(ix) here and
        # then fell through, executing the whole body twice per call.
    self.searcher = ix.searcher()
    fields = []
    qs = ''
    if self.Index is True:
        if self.word is not None and len(self.word) > 0:
            qs += u' index_letter:({0})'.format(self.word)
            fields.append("index_letter")
    else:
        # NOTE(review): this branch reads self._word while the one above
        # reads self.word — looks inconsistent; confirm which is intended.
        qs += u' verb_form:({0})'.format(self._word)
    self.query = MultifieldParser(fields, ix.schema).parse(qs)
def preform_whoosh_search(query, fields=None, **kwargs):
    """
    Query the index, looking for a match in the specified fields, and
    return the hits sorted by last edit date (newest first).
    """
    # Skip the search entirely when there is no index or the query is
    # shorter than the configured minimum.
    if not index_exists() or len(query) < settings.SEARCH_CHAR_MIN:
        return []
    fields = fields or ['content', 'title']
    ix = init_index()
    searcher = ix.searcher()
    # OrGroup splits the query into words joined by OR,
    # e.g. 'foo bar' == 'foo OR bar'.
    parsed = MultifieldParser(fieldnames=fields, schema=ix.schema,
                              group=OrGroup).parse(query)
    results = searcher.search(parsed, limit=settings.SEARCH_LIMIT,
                              terms=True, **kwargs)
    # Allow larger fragments, with more context before and after.
    results.fragmenter.maxchars = 100
    results.fragmenter.surround = 100
    # Sort the hits by last edit date, newest first.
    results = sorted(results, key=lambda hit: hit['lastedit_date'],
                     reverse=True)
    logger.info("Preformed index search")
    return results