def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
def search(self, query, page=-1, page_size=10):
    search_results = list()

    qp_artist = QueryParser('title', self.artist_searcher.schema)
    artist_query = qp_artist.parse(unicode(query))
    if page < 1:
        artist_hits = self.artist_searcher.search(artist_query, limit=None, sortedby='title')
    else:
        artist_hits = self.artist_searcher.search_page(artist_query, page, pagelen=page_size, sortedby='title')
    for hit in artist_hits:
        search_results.append(SearchResult(self._artist_from_document(hit), Type.ARTIST))

    qp_album = QueryParser('title', self.album_searcher.schema)
    album_query = qp_album.parse(unicode(query))
    if page < 1:
        album_hits = self.album_searcher.search(album_query, limit=None, sortedby='title')
    else:
        album_hits = self.album_searcher.search_page(album_query, page, pagelen=page_size, sortedby='title')
    for hit in album_hits:
        search_results.append(SearchResult(self._album_from_document(hit), Type.ALBUM))

    qp_track = QueryParser('title', self.track_searcher.schema)
    track_query = qp_track.parse(unicode(query))
    if page < 1:
        track_hits = self.track_searcher.search(track_query, limit=None, sortedby='title')
    else:
        track_hits = self.track_searcher.search_page(track_query, page, pagelen=page_size, sortedby='title')
    for hit in track_hits:
        search_results.append(SearchResult(self._track_from_document(hit), Type.TRACK))

    return search_results
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(a=u"alfa bravo charlie delta")
            w.add_document(a=u"delta echo foxtrot golf")
            w.add_document(a=u"golf hotel india juliet")
            w.add_document(a=u"juliet kilo lima mike")

        with ix.searcher() as s:
            qp = QueryParser("a", ix.schema)
            qtext = u'alpha ("brovo november" OR b:dolta) detail'
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            cq = c.query
            assert isinstance(cq, query.And)
            assert cq[0].text == "alfa"
            assert isinstance(cq[1], query.Or)
            assert isinstance(cq[1][0], query.Phrase)
            assert cq[1][0].words == ["bravo", "november"]

            qtext = u'alpha b:("brovo november" a:delta) detail'
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
            assert c.string == 'alfa b:("brovo november" a:delta) detail'

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)
    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return " ".join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "render rendering renders")
def find(q):
    ix = Index()
    parser = QueryParser("content", schema=SCHEMA)
    print parser.parse(unicode(q))
    results = ix.find(q)
    if len(results):
        print "Found in %d documents" % len(results)
    else:
        print "Not found"
def _search_tag_groups(self, is_filtering_tags):
    seen = None
    query_parser = QueryParser("tag", self._index.schema)
    options = {"limit": None,
               "groupedby": sorting.FieldFacet("tag", allow_overlap=True),
               "maptype": sorting.Count}

    with self._index.searcher() as searcher:
        total = searcher.search(query_parser.parse("*"), **options).groups()
        if not is_filtering_tags:
            seen = searcher.search(query_parser.parse("* AND flags:%s" % Status.SEEN), **options).groups()

    return seen, total
class WhooshGuess(object):
    def __init__(self):
        self.storage = RamStorage()
        schema = Schema(key=ID(stored=True),
                        ask=BOOLEAN(stored=True),
                        content=TEXT(stored=True, analyzer=RegexTokenizer()))
        self.ix = self.storage.create_index(schema)
        self.writer = self.ix.writer()
        self.is_train = False
        for s in greeting.split('\n'):
            self.train(u'matchinggreeting', s)

    @property
    def is_ok(self):
        return self.is_train

    def train(self, key, line):
        splits = u' '.join(list(lang.tokenizezh(line)))
        ask = lang.is_question(key)
        #print ask
        #print splits
        self.writer.add_document(key=key, content=splits, ask=ask)

    def train_ok(self):
        self.writer.commit(optimize=True)
        self.searcher = self.ix.searcher()
        self.parser = QueryParser("content", schema=self.ix.schema)
        self.is_train = True

    def guess(self, s, is_ask=None):
        assert self.is_train
        keys = list(lang.keyword(s))
        if len(keys) == 0:
            return ''

        # MUST contain the keys
        keys = u' '.join(keys)
        splits = u' '.join(list(lang.tokenizezh(s)))
        #q = self.parser.parse(splits + ' OR ' + keys)
        q1 = self.parser.parse(keys)
        q2 = self.parser.parse(splits)
        q = q1 | q2
        #print unicode(q)

        if not is_ask:
            ask = query.Term(u"ask", lang.is_question(s))
        else:
            ask = query.Term(u"ask", is_ask)

        results = self.searcher.search(q, filter=ask)
        for hit in results:
            return hit['key']
        return ''
def update_changeset_index(self):
    idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

    with idx.searcher() as searcher:
        writer = idx.writer()
        writer_is_dirty = False
        try:
            indexed_total = 0
            repo_name = None
            for repo_name, repo in self.repo_paths.items():
                # skip indexing if there aren't any revs in the repo
                num_of_revs = len(repo)
                if num_of_revs < 1:
                    continue

                qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                q = qp.parse(u"last:t AND %s" % repo_name)

                results = searcher.search(q)

                # default to scanning the entire repo
                last_rev = 0
                start_id = None

                if len(results) > 0:
                    # assuming that there is only one result, if not this
                    # may require a full re-index.
                    start_id = results[0]['raw_id']
                    last_rev = repo.get_changeset(revision=start_id).revision

                # there are new changesets to index or a new repo to index
                if last_rev == 0 or num_of_revs > last_rev + 1:
                    # delete the docs in the index for the previous
                    # last changeset(s)
                    for hit in results:
                        q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                     (repo_name, hit['raw_id']))
                        writer.delete_by_query(q)

                    # index from the previous last changeset + all new ones
                    indexed_total += self.index_changesets(writer,
                                                           repo_name, repo,
                                                           start_id)
                    writer_is_dirty = True
                    log.debug('indexed %s changesets for repo %s' % (
                              indexed_total, repo_name))
        finally:
            if writer_is_dirty:
                log.debug('>> COMMITTING CHANGES TO CHANGESET INDEX <<')
                writer.commit(merge=True)
                log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
            else:
                writer.cancel()
                log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX <<')
def GET(self):
    url = web.input().get('url')

    qp = QueryParser('url', schema=ix.schema)
    q = qp.parse(url)
    r = searcher.search(q, limit=1)
    doc = list(r)[0]

    qp = QueryParser('refers_to', schema=ix.schema)
    q = qp.parse(url)
    refs = searcher.search(q, limit=25)

    return render.show(doc, refs, DocumentSearcher(ix))
def test_query_terms():
    qp = QueryParser("a", None)

    q = qp.parse("alfa b:(bravo OR c:charlie) delta")
    assert sorted(q.iter_all_terms()) == [("a", "alfa"), ("a", "delta"),
                                          ("b", "bravo"), ("c", "charlie")]

    q = qp.parse("alfa brav*")
    assert sorted(q.iter_all_terms()) == [("a", "alfa")]

    q = qp.parse('a b:("b c" d)^2 e')
    tokens = [(t.fieldname, t.text, t.boost) for t in q.all_tokens()]
    assert tokens == [('a', 'a', 1.0), ('b', 'b', 2.0), ('b', 'c', 2.0),
                      ('b', 'd', 2.0), ('a', 'e', 1.0)]
def bm25_retrieve(query, num_res):
    ix = open_dir('index')
    searcher = ix.searcher()

    query_terms = query.split(' ')
    bool_query = ' OR '.join(query_terms)
    parser = QueryParser("content", ix.schema)
    real_query = parser.parse(bool_query)
    results = searcher.search(real_query, limit=num_res)
    new_results = {}
    res_len = len(results)

    # assume that the top 10 results are relevant
    ri = {}
    ni = {}
    R = 10
    N = res_len
    for term in query_terms:
        ri[term] = 0
        ni[term] = 0

    # for each term in the query, calculate its ri and ni
    for term in query_terms:
        for res in searcher.search(real_query):
            if term in res['content']:
                ri[term] += 1
        parser = QueryParser("content", ix.schema)
        term_query = parser.parse(term)
        ni[term] = len(searcher.search(term_query, limit=500))

    # for each document, calculate its bm25 score
    if num_res > 10:
        for res in results:
            new_results[res['id']] = 0
        for res in results:
            for term in query_terms:
                reg = re.compile(term)
                # fi is the term's frequency in the document
                fi = len(reg.findall(res['content']))
                k1 = 1.5
                b = 0.75
                avdl = 200
                K = k1 * (1 - b + b * len(res['content']) / avdl)
                new_results[res['id']] += math.log(
                    (ri[term] + 0.5) * (N - ni[term] - R + ri[term] + 0.5)
                    / (R - ri[term] + 0.5) / (ni[term] - ri[term] + 0.5)
                ) * (k1 + 1) * fi / (K + fi)
    return new_results
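# The scoring loop in bm25_retrieve() above implements BM25 with the
# Robertson/Sparck-Jones relevance weight (R docs assumed relevant, ri of
# them containing the term). A minimal sketch factoring that math into
# helpers -- the function names and the avdl=200 default are assumptions
# carried over from the snippet, not part of any library API:
import math

def rsj_weight(ri, ni, R, N):
    # log[ (ri+0.5)(N-ni-R+ri+0.5) / ((R-ri+0.5)(ni-ri+0.5)) ]
    return math.log((ri + 0.5) * (N - ni - R + ri + 0.5) /
                    ((R - ri + 0.5) * (ni - ri + 0.5)))

def bm25_term_score(fi, doclen, ri, ni, R, N, k1=1.5, b=0.75, avdl=200):
    # K normalizes the term frequency fi by document length relative to avdl.
    K = k1 * (1 - b + b * float(doclen) / avdl)
    return rsj_weight(ri, ni, R, N) * (k1 + 1) * fi / (K + fi)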
def contacts(self, query):
    if query:
        to = QueryParser('to', self._index.schema)
        cc = QueryParser('cc', self._index.schema)
        bcc = QueryParser('bcc', self._index.schema)
        with self._index.searcher() as searcher:
            to = searcher.search(to.parse("*%s*" % query), limit=None,
                                 groupedby=sorting.FieldFacet('to', allow_overlap=True)).groups()
            cc = searcher.search(cc.parse("*%s*" % query), limit=None,
                                 groupedby=sorting.FieldFacet('cc', allow_overlap=True)).groups()
            bcc = searcher.search(bcc.parse("*%s*" % query), limit=None,
                                  groupedby=sorting.FieldFacet('bcc', allow_overlap=True)).groups()
            return flatten([to, cc, bcc])
    return []
def get_answer(message):
    if '/' in message[0]:
        return None

    rx = r'jova,?\s(.+)$'
    m = re.match(rx, message)
    if not m or not m.group(1):
        return None

    global ix
    search_terms = m.group(1)
    parser = QueryParser("content", ix.schema)
    qry = parser.parse(search_terms)
    with ix.searcher() as searcher:
        results = searcher.search(qry)
        result = None
        if len(results) == 0:
            return None
        if len(results) == 1:
            result = results[0]
        else:
            result = random.choice(results)
        if result is None or 'path' not in result:
            return None
        return result['path'], 'plain-text'
    return None
def searchNote(self):
    pattern = self.searchEdit.text()
    qres = []
    with self.ix.searcher() as searcher:
        queryp = QueryParser("content", self.ix.schema)
        queryp.add_plugin(RegexPlugin())
        # r"pattern" is the desired regex term format
        query = queryp.parse('r"' + pattern + '"')
        pathFacet = sorting.FieldFacet("path")
        scores = sorting.ScoreFacet()
        # default limit is 10!
        results = searcher.search(query, limit=None,
                                  sortedby=[pathFacet, scores])
        for r in results:
            listItem = QListWidgetItem()
            title = r['title']
            text = r['path']
            term = r.highlights("content")
            qres.append([title, text, term])

    html = """
    <style>
        body { font-size: 14px; }
        .path { font-size: 12px; color: #009933; }
    </style>
    """
    for ti, te, hi in qres:
        html += ("<p><a href='" + te + "'>" + ti +
                 "</a><br/><span class='path'>" + te +
                 "</span><br/>" + hi + "</p>")
    self.searchView.setHtml(html)
def findsnippets(query, daterange=None, page=1, ndocs=PER_PAGE,
                 MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if daterange is not None:
            datequery = DateRange("date", daterange[0], daterange[1])
            results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
        else:
            results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)

        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.Count)
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                daycount_orig = searcher.search(datequery & myquery,
                                                groupedby=myfacet,
                                                limit=MAX_SEARCH_RESULTS)
            else:
                daycount_orig = searcher.search(myquery, groupedby=myfacet,
                                                limit=MAX_SEARCH_RESULTS)
            for day in daycount_orig.groups():
                daycount[day] = daycount_orig.groups()[day]
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'],
                              date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'],
                            'identifier': result['identifier'],
                            'date': result['date'],
                            'snippet': snippet})
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'],
                              date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'],
                            'identifier': result['identifier'],
                            'date': result['date'],
                            'snippet': snippet})
            total_docs = results.estimated_length()
            return res, total_docs
def finddocs(query, daterange=None, page=1, ndocs=PER_PAGE,
             MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.UnorderedList)
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery,
                                          groupedby=myfacet,
                                          limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, groupedby=myfacet,
                                          limit=MAX_SEARCH_RESULTS)
            doc_cnt = 0
            for day, docs in results.groups().iteritems():
                daycount[day] = len(docs)
                for result in docs:
                    if doc_cnt in range((page - 1) * ndocs, page * ndocs):
                        res.append({'title': searcher.stored_fields(result)['title'],
                                    'identifier': searcher.stored_fields(result)['identifier'],
                                    'date': searcher.stored_fields(result)['date']})
                    doc_cnt += 1
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery,
                                          limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
            for result in results[(page - 1) * ndocs:page * ndocs]:
                res.append({'title': result['title'],
                            'identifier': result['identifier'],
                            'date': result['date']})
            total_docs = results.estimated_length()
            return res, total_docs
def searchNote(self):
    """Sorting criteria: "title > path > content"

    Search matches are organized into HTML source.
    """
    pattern = self.searchEdit.text()
    if not pattern:
        return
    results = []
    print("Searching using", pattern)
    with self.ix.searcher() as searcher:
        matches = []
        for f in ["title", "path", "content"]:
            queryp = QueryParser(f, self.ix.schema)
            queryp.add_plugin(RegexPlugin())
            # r"pattern" is the desired regex term format
            query = queryp.parse('r"' + pattern + '"')
            # default limit is 10!
            ms = searcher.search(query, limit=None)
            for m in ms:
                if m not in matches:
                    matches.append(m)
        for r in matches:
            title = r['title']
            path = r['path']
            term = r.highlights("content")
            results.append([title, path, term])

    html = ""
    for title, path, hi in results:
        html += ("<p><a href='" + path + "'>" + title +
                 "</a><br/><span class='path'>" + path +
                 "</span><br/>" + hi + "</p>")
    self.searchView.setHtml(html)
    print("Finished searching", pattern)
def __call__(self, query):
    """search"""
    query = unicode(query)
    query_parser = QueryParser("description", schema=self.ix.schema)
    myquery = query_parser.parse(query)

    # Old code: too strict
    # extendedquery = Or([myquery] +
    #                    [Term(field, query) for field in self.keywords])

    # New code: too permissive
    # extendedquery = [myquery]
    excluded = set(["AND", "OR", "NOT"])
    terms = [i for i in query.split() if i not in excluded]
    # for field in self.keywords:
    #     extendedquery.extend([Term(field, term) for term in terms])
    # extendedquery = Or(extendedquery)

    # Code should look something like
    # Or([myquery] + [Or(
    # extendedquery = [myquery]
    extendedquery = And(
        [Or([myquery] +
            [Term("description", term), Term("name", term)] +
            [Term(field, term) for field in self.keywords])
         for term in terms])

    # perform the search
    searcher = self.ix.searcher()
    return [i["name"] for i in searcher.search(extendedquery, limit=None)]
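# For illustration of the query shape built in __call__() above, with a
# hypothetical self.keywords of ["tags"] and the user query u"static blog":
# every user term must match the parsed description query or one of the
# fields, so the expanded query is (names here are illustrative only):
#
#   And([Or([myquery, Term("description", "static"), Term("name", "static"),
#            Term("tags", "static")]),
#        Or([myquery, Term("description", "blog"), Term("name", "blog"),
#            Term("tags", "blog")])])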
def populateTable(self, searchterm=None):
    self.infoTable.clear()
    self.infoTable.setHorizontalHeaderLabels(
        ["Title", "Authors", "Tags", "Year", "Read"])
    self.infoTable.setRowCount(0)
    self.infoTable.horizontalHeader().setResizeMode(0, QHeaderView.Stretch)
    self.infoTable.verticalHeader().hide()

    if searchterm is None or searchterm == "":
        papers = KKDocument.objects.all()
        for p in papers:
            a = ', '.join([x.name for x in p.authors.all()])
            t = ', '.join([x.tag for x in p.tags.all()])
            self.newEntry(p.title, a, t, p.year, p)
        return  # we're done here - all papers printed

    # only if there is a search term:
    # search full text with whoosh
    print "FINDING %s" % searchterm
    searcher = self.whoosh_ix.searcher()
    parser = QueryParser("content", schema=self.whoosh_schema)
    query = parser.parse(unicode(searchterm))
    whoosh_results = searcher.search(query)
    print "FOUND", len(whoosh_results), "Objects"
    for r in whoosh_results:
        p = KKDocument.objects.get(localFile=r['path'])
        a = ', '.join([x.name for x in p.authors.all()])
        t = ', '.join([x.tag for x in p.tags.all()])
        self.newEntry(p.title, a, t, p.year, p)
def OnlyOneSearch(queryStr="", index=".index"):
    ix = get_index(index)
    searcher = ix.searcher()
    parser = QueryParser("name", schema=ix.schema)
    query = parser.parse(queryStr)
    results = searcher.search(query)
    return results
def search_datasets(self, search_phrase, limit=None):
    """Search for just the datasets."""
    from collections import defaultdict
    from whoosh.qparser import QueryParser

    parser = QueryParser("doc", schema=self.dataset_index.schema)
    query = parser.parse(search_phrase)

    datasets = defaultdict(SearchResult)

    with self.dataset_index.searcher() as searcher:
        results = searcher.search(query, limit=limit)
        for hit in results:
            vid = hit.get('vid')
            bvid = hit.get('bvid')
            type = hit.get('type')
            datasets[bvid].vid = bvid
            if type == 'b':
                datasets[bvid].bundle_found = True
                datasets[bvid].b_score += hit.score
            else:
                datasets[bvid].p_score += hit.score
                datasets[bvid].partitions.add(vid)

    return datasets
def get(self):
    wikiResults = None
    jobResults = None
    projectResults = None
    if 'searchScope' in request.args and 'searchTerm' in request.args:
        searchTerm = request.args.get('searchTerm')
        searchScope = request.args.get('searchScope')
        index = open_dir('app/search/index')
        parser = QueryParser("content", schema=index.schema)
        with index.searcher() as searcher:
            if searchScope in ['everything', 'wiki']:
                wikiResults = [{'title': result['title'],
                                'url': 'http://jhcwiki.jhc.co.uk/wiki/index.php/' + result['title'].replace(' ', '_')}
                               for result in searcher.search(parser.parse(searchTerm), limit=200)
                               if result['type'] == 'WIKI']
            if searchScope in ['everything', 'jobs']:
                jobResults = [{'title': result['title'], 'url': ''}
                              for result in searcher.search(parser.parse(searchTerm), limit=200)
                              if result['type'] == 'JOB']
            if searchScope in ['everything', 'projects']:
                projectResults = [{'title': result['title'],
                                   'url': url_for('projects.projectDetail',
                                                  projectCode=result['title'].split('-')[0].strip())}
                                  for result in searcher.search(parser.parse(searchTerm), limit=200)
                                  if result['type'] == 'PROJECT']
    else:
        searchTerm = ''
        searchScope = 'everything'
    return render_template('search/search.html', wikiResults=wikiResults,
                           jobResults=jobResults, projectResults=projectResults,
                           searchTerm=searchTerm, searchScope=searchScope,
                           title="Search")
def grid_search(rookie_avg, surround, fragment_char_limit, whoosh_results, corpus, query_string):
    '''
    Find the best top parameter for whoosh snips.

    The top parameter controls how many ...-delimited fragments are shown;
    best = minimize distance w/ average size of rookie snip.
    '''
    best = None
    best_distance_so_far = 1000000000
    index = open_dir("indexes/{}/".format(corpus))
    corpusid = getcorpusid(corpus)
    for top in range(1, 5):  # basically fixed for now
        with index.searcher() as srch:
            query_parser = QueryParser("content", schema=index.schema)
            qry = query_parser.parse(query_string)
            results = srch.search(qry, limit=None)
            results.fragmenter.surround = surround
            results.fragmenter.maxchars = fragment_char_limit
            sum_ = 0
            for s_ix, a in enumerate(results):
                path = a.get("path").replace("/", "")
                sents = get_preproc_sentences(path, corpusid)
                sss = unicode(" ".join(sents).encode("ascii", "ignore"))
                sss = str(a.highlights("content", text=sss, top=top))
                sum_ += len(sss)
            diff = abs(rookie_avg - sum_ / len(results))
            print "top of {} gives diff of {}".format(top, diff)
            if diff < best_distance_so_far:
                best = top
                best_distance_so_far = diff
    print "best top = {}".format(best)
    return best
def index_query(environ, **kwargs):
    """
    Return a generator of tiddlers that match the provided arguments.
    """
    logging.debug('entering with %s', environ)
    print 'getting called on index_query'
    config = environ['tiddlyweb.config']
    #store = environ['tiddlyweb.store']
    query_parts = []
    for field, value in kwargs.items():
        if field == 'tag':
            field = 'tags'
        query_parts.append('%s:%s' % (field, value))
    query_string = ' '.join(query_parts)
    print 'getting inside on index_query'

    schema = config.get('wsearch.schema', SEARCH_DEFAULTS['wsearch.schema'])
    searcher = get_searcher(config)
    parser = QueryParser('text', schema=Schema(**schema))
    query = parser.parse(query_string)
    logging.debug('query parsed to %s' % query)
    results = searcher.search(query)

    def tiddler_from_result(result):
        print 'r', result
        bag, title = result['id'].split(':', 1)
        tiddler = Tiddler(title, bag)
        return tiddler
        #return store.get(tiddler)

    for result in results:
        yield tiddler_from_result(result)
    return
def stage3():
    ix = open_dir(index_directory)
    if not ix:
        print "No index"
        return
    parser = QueryParser("content", ix.schema)
    with ix.searcher() as searcher:
        try:
            while True:
                search_phrase = raw_input('Search phrase: ')
                if not search_phrase:
                    continue
                search_phrase = search_phrase.decode(sys.stdin.encoding)
                myquery = parser.parse(search_phrase)
                results = searcher.search(myquery)
                if results:
                    for result in results:
                        print "%s - %s (%s)" % (result['url'], result['title'], result['company'])
                else:
                    print "No matching results"
                print "\r\n"
        except KeyboardInterrupt:
            print "\nBae..."
            return
def search(q, default_field="content"):
    ix = index.open_dir(SEARCH_INDEX)
    searcher = ix.searcher()
    parser = QueryParser(default_field, schema=ix.schema)
    query = parser.parse(q)
    results = searcher.search(query)
    return results
def search(request):
    hits = []
    results = []
    query = request.GET.get('q', None)
    newspaper = request.GET.get('newspaper', None)
    if newspaper is not None:
        index_dir = "C:/Django Projects/searcher/modules/index" + newspaper
        ix = index.open_dir(index_dir)
        searcher = ix.searcher()
    if query is not None and query != u"":
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = QueryParser("content", schema=ix.schema)
        try:
            qry = parser.parse(query)
        except:
            qry = None
        if qry is not None:
            hits = searcher.search(qry)
            for hit in hits:
                title = hit['title']
                url = hit['url']
                date = hit['date']
                highlights = hit.highlights("content")
                keywords_list = [keyword for keyword, score
                                 in searcher.key_terms_from_text("content", hit['content'])]
                keywords = ", ".join(keywords_list)
                results.append(Result(title, url, date, highlights, keywords))
    variables = RequestContext(request, {'query': query, 'hits': results})
    return render_to_response('search.html', variables)
def crearEsquemaCorreo():
    correo1 = "1.txt"
    correoEsquema = Schema(remitente=ID(stored=True),
                           destinatarios=KEYWORD(stored=True),
                           fecha=DATETIME(stored=True),
                           asunto=KEYWORD(stored=True),
                           cuerpo=TEXT(stored=True))
    if not os.path.exists("indexCorreo"):
        os.mkdir("indexCorreo")
    iC = index.create_in("indexCorreo", correoEsquema)
    iC = open_dir("indexCorreo")
    writer = iC.writer()
    fecha = "20101015"
    date_email = datetime.strptime(fecha, "%Y%m%d")
    writer.add_document(remitente=u"unoarrobagmail.com",
                        destinatarios=u"dosarrobagmail.com tresarrobagmail.com",
                        fecha=date_email,
                        asunto=u"Contrato de compraventa con la constructora",
                        cuerpo=u"Estimados socios: ya hemos firmado el contrato de compraventa con el cliente preferencial. Espero noticias vuestras. Un saludo,")
    #writer.add_document(email=u"dosarrobagmail.com", name=u"Pedro Guerra")
    #writer.add_document(email=u"tresarrobagmail.com", name=u"Ana Montero")
    #writer.add_document(email=u"cuatroarrobagmail.com", name=u"Luis Pontes")
    writer.commit()

    qp = QueryParser("remitente", schema=iC.schema)
    q = qp.parse(u"unoarrobagmail.com")
    with iC.searcher() as s:
        results = s.search(q)
        print results[0]
def search_files(index_dir, content):
    """
    Search file content in the index.

    if not hit: return False
    if hit: return results
    """
    index_exist = index.exists_in(index_dir)
    if not index_exist:
        print("index does not exist")
        return False
    ix = index.open_dir(index_dir)
    content = unicode(content)
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        query = parser.parse(content)
        # whoosh.searching.Results
        results = searcher.search(query)
        print(type(results))
        l = len(results)
        print l
        for h in results:
            # whoosh.searching.Hit
            print type(h)
            print h
        return results
    return False
def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext, s.schema)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'
def search_song_by_title(title, index):
    results_list = list()
    qp = QueryParser('title', schema=index.schema)
    q = qp.parse(u"{}".format(title))
    with index.searcher() as searcher:
        results = searcher.search(q)
        for result in results:
            data = {
                'title': result['title'],
                'artist': result['artist'],
                'full_lyrics': result['full_lyrics'],
                'lyrics': result['lyrics'],
                'album': result['album']
            }
            results_list.append(data)
    return results_list
def lookup(self, key, field="entity_id"):
    if key == 'entities' or key is None:
        return self._entities()

    key = self._prep_key(key)
    qp = QueryParser("object_id", schema=self.schema)
    q = qp.parse(key)
    lst = set()
    with self.index.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for result in results:
            e = self.objects.get(result['object_id'], None)
            if e is not None:
                lst.add(e)
    return b2u(list(lst))
def testBuscarPalabrasClave(autor):
    """Look up the keywords of the text that accompanies the painting."""
    ix = whoosh.index.open_dir("ficheros/index")
    parser = QueryParser("autor", ix.schema)
    myquery = parser.parse(autor)
    with ix.searcher() as searcher:
        results = searcher.search(myquery)
        print results
        keywords = [keyword for keyword, score
                    in results.key_terms("descripcion", numterms=4)]
    return keywords
def search(ix, search_key):
    qp = QueryParser("body", schema=ix.schema, termclass=MyFuzzyTerm)
    q = qp.parse(search_key)
    try:
        with ix.searcher() as s:
            results = s.search(q)
            data = [res['data'] for res in results]
            return {
                "hits": len(data),
                "data": data,
                "error": None,
                "msg": "Success"
            }
    except Exception as e:
        return {"hits": None, "data": None, "error": str(e),
                "msg": "Invalid search / unknown error"}
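# MyFuzzyTerm is not defined in the snippet above. The usual Whoosh recipe for
# fuzzy matching through QueryParser's termclass hook is a FuzzyTerm subclass
# that widens the default edit distance; a sketch (the maxdist=2 choice is an
# assumption, not taken from the original code):
from whoosh.query import FuzzyTerm

class MyFuzzyTerm(FuzzyTerm):
    def __init__(self, fieldname, text, boost=1.0, maxdist=2,
                 prefixlength=1, constantscore=True):
        super(MyFuzzyTerm, self).__init__(fieldname, text, boost, maxdist,
                                          prefixlength, constantscore)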
def search(query):
    """Search the stopindex for `query` using Whoosh."""
    from info import IDX_NAME
    ix = open_dir(IDX_NAME)
    parser = QueryParser("name", ix.schema)
    q = parser.parse(query)
    with ix.searcher() as s:
        results = s.search(q, limit=100)
        if not results:
            resp = []
        else:
            resp = [(r['name'], r['location'], r['sid']) for r in results]
    return resp
def _search(self, query='', field=None, index=None, terms=False, limit=None):
    '''query (exact match) search'''
    index = index or self._default_index
    ix = self.get_index(index)
    fieldin = field or 'content'
    qp = QueryParser(fieldin, ix.schema)
    qp.add_plugin(ws.qparser.SingleQuotePlugin())
    query = qp.parse(query, normalize=False)
    with ix.searcher() as searcher:
        if terms is True:
            results = searcher.search(query, terms=True, limit=limit).matched_terms()
        else:
            results = list(searcher.search(query, limit=limit).items())
    return results
def index():
    search_field = request.args.get('search')
    text = request.args.get('word')
    if text is None:
        text = ""
    # text = "大学"
    schema = Schema(title=TEXT(stored=True), content=TEXT, url=STORED,
                    created_at=STORED, inn=STORED, out=STORED)
    ix = open_dir('daigo_search_dir')

    t = Tokenizer()
    str_output = ""
    for token in t.tokenize(text):
        str_output += token.surface + " "

    with ix.searcher() as searcher:
        query_ = QueryParser("content", ix.schema)
        query = query_.parse(str_output)
        results = searcher.search(query)
        if int(results.estimated_length()) > 2:
            pass
        else:
            # fall back to an OR search
            query = QueryParser("content", ix.schema, group=OrGroup).parse(str_output)
            results = searcher.search(query)

        titles = []
        timestamps = []
        urls = []
        oldr = ""
        for r in results:
            if r != oldr:
                titles.append(r['title'])
                timestamps.append(r['inn'] + "~" + r["out"])
                urls.append(r["url"])
            oldr = r

    df = pd.DataFrame(columns=['動画タイトル', 'URL', 'この辺りかも'])
    df["動画タイトル"] = titles
    df["この辺りかも"] = timestamps
    df["URL"] = urls
    return render_template('search.html', word=text,
                           search_result=df.to_html(classes='books', index=False, justify="center"))
def clusterVars(ind, path):
    chars = []

    # Importance (# lines of dialogue)
    nameParser = QueryParser("name", schema=ind.schema)
    for character in lists.characters:
        query = nameParser.parse(character["Name"][0])
        with ind.searcher() as searcher:
            results = searcher.search(query, limit=None)
            obj = {}
            obj["Name"] = character["Name"][0]
            obj["NumQuotes"] = 0
            for result in results:
                obj["NumQuotes"] += len(result['quote'].split('\n'))
            chars.append(obj)

    # Interest in Luke (Luke mentions), Force / Dark Side mentions
    query1 = Or([Phrase("quote", ["red", "five"]), Term("quote", "luke"),
                 Term("quote", "skywalker")])
    query2 = Or([Phrase("quote", ["dark", "side"]), Term("quote", "force")])
    with ind.searcher() as searcher:
        results1 = searcher.search(query1, limit=None)
        results2 = searcher.search(query2, limit=None, terms=True)
        for character in chars:
            character["InterestInLuke"] = 0
            character["ForceMentions"] = 0
            for result1 in results1:
                if result1['name'] == character["Name"].upper():
                    character["InterestInLuke"] += 1
            for result2 in results2:
                if result2['name'] == character["Name"].upper():
                    character["ForceMentions"] += 1

    # Take proportions
    for character in chars:
        if character["NumQuotes"] > 0:
            character["InterestInLuke"] = (character["InterestInLuke"] + 0.0) / character["NumQuotes"]
            character["ForceMentions"] = (character["ForceMentions"] + 0.0) / character["NumQuotes"]

    # Character sentiment average
    charSents = charSentiment(ind, path)
    for charSent in charSents:
        for character in chars:
            if character["Name"] == charSent:
                character["Sentiment"] = charSents[charSent]

    return chars
def update_index_emailthreads(self, groupsio_token, config):
    """
    Update the search index using the email archives
    of groups.io subgroups.

    This method uses the Groups.io API via methods
    defined in groupsio_util.py
    """
    # Get the set of indexed ids:
    # ------
    indexed_ids = set()
    p = QueryParser("kind", schema=self.ix.schema)
    q = p.parse("emailthread")
    with self.ix.searcher() as s:
        results = s.search(q, limit=None)
        for result in results:
            indexed_ids.add(result['id'])

    # Get the set of remote ids:
    # ------
    archives = get_mbox_archives(groupsio_token)

    writer = self.ix.writer()
    count = 0

    # archives is a dictionary
    # keys are IDs (urls)
    # values are dictionaries

    # Start by collecting all the things
    remote_ids = set()
    for k in archives.keys():
        remote_ids.add(k)

    # drop indexed_ids
    for drop_id in indexed_ids:
        writer.delete_by_term('id', drop_id)

    # add remote_ids
    for add_id in remote_ids:
        item = archives[add_id]
        self.add_emailthread(writer, item, config, update=False)
        count += 1

    writer.commit()
    print("Done, updated %d Groups.io email threads in the index" % count)
def mostrar_lista(event):
    lb.delete(0, END)
    ix = open_dir(dir_index)
    with ix.searcher() as searcher:
        my_query = str(en_fecha_comienzo.get()) + " TO " + str(en_fecha_fin.get())
        qp = QueryParser("fechaPublicacion", ix.schema)
        q = qp.parse(my_query)
        results = searcher.search(q)
        for r in results:
            lb.insert(END, "Titulo: " + r['titulo'])
            lb.insert(END, "Fecha publicacion: " + r['fechaPublicacion'].strftime('%Y/%m/%d'))
            lb.insert(END, "")
def delete(cached):
    "Remove file from index."
    ix = open_dir(DIRECTORY, NAME)
    with ix.searcher() as searcher, ix.writer() as w:
        qp = QueryParser(u'cached', ix.schema)
        q = qp.parse(unicode(cached))
        results = searcher.search(q)
        if len(results) == 0:
            # Should only happen if the user hasn't run skid-update since
            # adding the paper being deleted.
            print 'Cached file %r not found in index.' % cached
        elif len(results) == 1:
            w.delete_document(results[0].docnum)
            return True
        else:
            assert False, 'This should never happen. ' \
                'Multiple (%s) results for %r found for cached file.' % (len(results), cached)
def test_multi(self):
    import threading

    import transaction
    from whoosh_tm.datamanager import WhooshDataManager
    from whoosh.qparser import QueryParser
    from whoosh.index import create_in, open_dir

    with TempDirectory() as d:
        ix1 = create_in(d.path, dummy_schema)
        ix2 = open_dir(d.path)

        def add_document1():
            dm1 = WhooshDataManager(ix1)
            t = transaction.get()
            t.join(dm1)
            dm1.add_document(
                title=u"First document",
                path=u"/a",
                content=u"This is the first document we've added!")
            transaction.commit()

        def add_document2():
            dm2 = WhooshDataManager(ix2)
            t = transaction.get()
            t.join(dm2)
            dm2.add_document(
                title=u"Second document",
                path=u"/b",
                content=u"The second one is even more interesting!")
            transaction.commit()

        th1 = threading.Thread(target=add_document1)
        th2 = threading.Thread(target=add_document2)
        th1.start()
        th2.start()
        th1.join()
        th2.join()

        ix3 = open_dir(d.path)
        with ix3.searcher() as searcher:
            parser = QueryParser("content", ix3.schema)
            results = searcher.search(parser.parse('second'))
            self.assertEqual(len(results), 1)
            self.assertEqual(results[0]["title"], "Second document")
def more_like_this(request, pid):
    ix = index.open_dir(settings.WHOOSH_INDEX)
    searcher = ix.searcher()
    qp = QueryParser("pid", schema=ix.schema)
    qq = qp.parse(pid)
    doc = searcher.search(qq)
    first = doc[0]
    title = "%s: %s" % (first['type'], first['title'])
    res = first.more_like_this("content", numterms=NUM_TERMS)
    res = map(decorate, res)
    ix.close()
    messages.info(request, 'Posts similar to <b>%s</b>' % title)
    return res
def ApartadoB(fecha):
    ix = open_dir("index")
    qp = QueryParser("fecha", schema=ix.schema)
    query = unicode("'" + fecha + " to today'")
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(query)
    print(q)
    s = ix.searcher()
    results = s.search(q)
    print(results)
    for n in results:
        print n.get("fecha")
        print n.get("remitente")
        print n.get("destinatarios")
        print n.get("asunto")
        print("*************\n")
    return results
def readData(self, data):
    results = None
    # init searcher
    with self.ix.searcher() as searcher:
        parser = QueryParser(data["query_field"], self.ix.schema,
                             plugins=[], group=qparser.OrGroup)
        # parser.add_plugin(qparser.FuzzyTermPlugin())
        myquery = parser.parse(data["query_str"])
        # print("-=-=-=-=-=-=", parser.filters())
        results = [dict(d_) for d_
                   in searcher.search(myquery, limit=self.search_limit)]
        # print(parser, myquery)
    return results
def get_wiki_articles(answers, ix):
    qp = QueryParser("title", schema=ix.schema)
    #print("Searching For: %s" % answers)
    q = qp.parse(answers)
    with ix.searcher() as s:
        results = s.search(q, limit=1)
        fname = None
        if len(results) > 0:
            for result in results:
                fname = result['file_path']
                title = result['title']
                #print("Found: %s" % result['title'])
            text = get_article_text(fname, title)
        else:
            text = None
    return text
def getMentionGraph(self, coreUsers: list):
    qp = QueryParser("mentionsUsers", schema=self.ix.schema)
    tq = qp.parse("*")  # something in mentionsUsers
    ret = defaultdict(lambda: defaultdict(int))
    with self.getSearcher() as s:
        for uid in coreUsers:
            uq = whoosh.query.Term("user", uid)  # limit to uid
            q = whoosh.query.And([uq, tq])
            res = s.search(q, limit=10000000)
            thisUserContrib = ret[uid]
            for r in res:
                mentions = r["mentionsUsers"]
                mentions = mentions.split(",")
                for m in mentions:
                    thisUserContrib[m] += 1
    return ret
def full_text_search(q):
    ix = open_dir(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH)
    parser = QueryParser("transcript", ix.schema)
    q = q.decode('utf-8')
    query = parser.parse(q)
    results = []
    with ix.searcher() as searcher:
        res = searcher.search(query)
        for r in res:
            results.append((r.fields()["transcription_id"],
                            r.highlights("transcript")))
    return results
def search(ix, query, project_ids=[], limit=10):
    query_parser = QueryParser("name", schema=ix.schema)
    whoosh_query = query_parser.parse(query)
    is_project_filter = len(project_ids) > 0
    ids = []
    with ix.searcher() as searcher:
        if is_project_filter:
            project_id_terms = Or([Term("project_id", project_id)
                                   for project_id in project_ids])
            results = searcher.search(whoosh_query, filter=project_id_terms,
                                      limit=limit)
        else:
            results = searcher.search(whoosh_query, limit=limit)
        for result in results:
            ids.append(result["id"])
    return ids
class HamsterIndex(object):
    def __init__(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
            self.index = create_in(path, MovieSchema)
        self.index = open_dir(path)
        self.q_parser = QueryParser("title", self.index.schema)

    def index_movie(self, movie):
        u = unicode
        cast = movie["cast"]
        actors = [c['name'] for c in cast]
        director = movie["director"]
        directors = [d['name'] for d in director]
        writer = self.index.writer()
        writer.add_document(imdb_id=u(movie["_id"]),
                            title=u(movie["title"]),
                            genre=u(";".join(movie["genres"])),
                            plot=u(movie.get("plot", "")),
                            cast=u(";".join(actors)),
                            director=u(";".join(directors)),
                            rating=movie["rating"])
        writer.commit()

    def list_all(self):
        num = 0
        retval = []
        with self.index.searcher() as searcher:
            for res in searcher.reader().all_stored_fields():
                num += 1
                retval.append(res)
        return retval

    def query(self, querystring):
        querystring = unicode(querystring)
        myquery = self.q_parser.parse(querystring)
        with self.index.searcher() as searcher:
            results = searcher.search(myquery)
            retval = []
            for res in results:
                retval.append(res.fields())
        return retval
def busquedaTitulos(descripcion):
    ix = open_dir("index")
    # print ix.doc_count_all()  # tells you how many docs are indexed, in case you're not sure
    # for r in ix.searcher().documents():
    #     print("entrada: " + str(r))
    qp = QueryParser("descripcion", schema=ix.schema)
    q = qp.parse(unicode(str(descripcion)))
    s = ix.searcher()
    results = s.search(q)
    titulos = []
    for r in results:
        titulos.append(r.get("titulo"))
    return titulos
def query_article_by_key(key=""):
    """Query article content by key."""
    if len(key) <= 0:
        return []
    result = list()
    index = open_dir(INDEX_PATH)
    searcher = index.searcher()
    parser = QueryParser("content", schema=index.schema)
    result_list = searcher.search(parser.parse(key))
    for hit in result_list:
        result.append(hit)
    return result
def index_search(search_terms, document_root):
    qp = QueryParser("content", schema=markupserve_index.schema)
    query = qp.parse(safe_unicode(search_terms))

    results = collections.defaultdict(list)
    split_terms = shlex.split(search_terms)

    with markupserve_index.searcher() as searcher:
        query_results = searcher.search(query, limit=None)
        for result in query_results:
            filename = result["path"].decode("utf-8").decode("unicode-escape")
            results[filename].append(
                result.highlights("content").decode("utf-8").decode("unicode-escape"))
    return results
def search(self, searchStr, lang):
    '''
    Search for a code snippet based off a search string (searchStr)
    and the language.

    s.search("How to append to a list", "Python")
    '''
    with self.ix.searcher() as searcher:
        qp = QueryParser("description", self.ix.schema, group=OrGroup)
        # Since the index is singularized, we must search using a
        # singularized string.
        query = qp.parse(
            unicode("(%s) AND (lang:%s)" % (self.singularize(searchStr), lang)))
        results = searcher.search(query)
        returnThis = [(x['description'].lower(), x['path']) for x in results]
    return returnThis
def modify(self, name, metadata):
    """Modify a document's metadata."""
    # Let's check if the document exists.
    # It must be present in the index.
    qp = QueryParser("hash", schema=self.index.schema)
    q = qp.parse(name)
    with self.index.searcher() as s:
        # No results
        if len(s.search(q)) == 0:
            raise IOError("Document does not exist")

    # So we do have a document, we just need to update it
    metadata['hash'] = name

    # Write everything into the index
    self.__update_index(metadata)
def test_can_search_id_and_summary_TODO(self):
    # arrange
    self.insert_ticket("test x")
    self.insert_ticket("test 1")
    fieldboosts = dict(id=1, summary=1)
    mfp = MultifieldPlugin(list(fieldboosts.keys()))
    pins = [WhitespacePlugin, PhrasePlugin, mfp]
    parser = QueryParser(None, WhooshBackend.SCHEMA, plugins=pins)
    parsed_query = parser.parse("1")
    result = self.whoosh_backend.query(parsed_query)
    self.print_result(result)
    self.assertEqual(2, result.hits)
def find(self, text):
    '''
    Find articles by keyword.

    :param text: the keyword
    :return: a list of dicts, each including path, title, and content
    '''
    searcher = self.ix.searcher()
    ret_list = []
    parser = QueryParser("content", schema=self.ix.schema)
    try:
        word = parser.parse(text)
    except:
        word = None
    if word is not None:
        hits = searcher.search(word, limit=None)
        for hit in hits:
            ret_list.append(dict(hit))
    return ret_list
def search(self, user_query):
    '''Search the index for wikis that relate to the user's query.'''
    # Trade some speed for quality: also search for variations of what
    # the user queried for.
    parser = QueryParser('content', schema=self._ix.schema,
                         termclass=query.Variations)
    q = parser.parse(u'{0}'.format(user_query))

    results = []
    with self._ix.searcher() as searcher:
        hits = searcher.search(q)
        hits.fragmenter = self._context_fragmenter
        for hit in hits:
            results.append(
                SearchResult(hit['path'], hit.highlights('content')))
    return results
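# A minimal sketch of the trade-off noted above, assuming an index ix with a
# 'content' field: with termclass=query.Variations the parser emits Variations
# queries instead of plain Terms, so u"run" also matches morphological
# variants like "running", at the cost of expanding each term at search time.
from whoosh import query
from whoosh.qparser import QueryParser

plain = QueryParser('content', schema=ix.schema)
varying = QueryParser('content', schema=ix.schema, termclass=query.Variations)
plain.parse(u'run')    # query.Term('content', u'run')
varying.parse(u'run')  # query.Variations('content', u'run')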
def find(self, querystring, parser=None, **kwargs):
    """Parses querystring, runs the query in this index, and returns a
    Results object. Any additional keyword arguments are passed to
    Searcher.search() along with the parsed query.

    :param querystring: The query string to parse and search for.
    :param parser: A Parser object to use to parse 'querystring'.
        The default is to use a standard qparser.QueryParser.
        This object must implement a parse(str) method which returns a
        query.Query instance.
    :returns: searching.Results
    """
    if parser is None:
        from whoosh.qparser import QueryParser
        parser = QueryParser(self.schema)
    return self.searcher().search(parser.parse(querystring), **kwargs)
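# Usage sketch for find() above (the query string and field name are
# hypothetical): any object with a parse(str) -> query.Query method can be
# passed as the parser, and extra keyword arguments such as limit go straight
# through to Searcher.search().
results = ix.find(u"whoosh query parser", limit=5)
for hit in results:
    print(hit["title"])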
def search_similar(self, entity, skip=[]):
    with self.index.searcher() as searcher:
        qp = QueryParser("fingerprint", schema=self.index.schema)
        # parser.add_plugin(qparser.FuzzyTermPlugin())
        tokens = set()
        for fp in entity.fingerprints:
            tokens.update(fp.split())
        if entity.country:
            tokens.add('country:%s' % entity.country)
        tokens = ' OR '.join(tokens)
        tokens = ['(%s)' % tokens]
        for uid in skip:
            tokens.append('(NOT uid:%s)' % uid)
        q = ' AND '.join(tokens)
        q = qp.parse(q)
        restrict_q = Term("uid", entity.uid)
        for result in searcher.search(q, mask=restrict_q):
            yield result.get('uid')
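# The string-building in search_similar() above can misparse when fingerprint
# tokens contain query-syntax characters. An equivalent sketch composing
# whoosh.query objects directly, under the same schema assumptions (the helper
# name is illustrative, not part of the original code):
from whoosh.query import And, Not, Or, Term

def build_similarity_query(entity, skip=()):
    # One Term per fingerprint token, plus an optional country term,
    # OR-ed together...
    should = [Term("fingerprint", t)
              for fp in entity.fingerprints
              for t in fp.split()]
    if entity.country:
        should.append(Term("country", entity.country))
    # ...then AND-ed with a NOT clause for each uid to skip.
    clauses = [Or(should)]
    clauses.extend(Not(Term("uid", uid)) for uid in skip)
    return And(clauses)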