from whoosh import qparser, scoring
from whoosh.index import open_dir
from whoosh.query import Or


def get_video_ids(query):
    levenshtein_distance = 1
    index = open_dir(corpus_index_dir)  # corpus_index_dir is defined at module level
    query_terms = query.split(" ")
    # Turn every term into a fuzzy term, e.g. "python" -> "python~1"
    fuzzy_query_terms = [
        "{0}~{1}".format(qt, levenshtein_distance) for qt in query_terms
    ]
    fuzzy_query_terms = " ".join(fuzzy_query_terms)
    # Parse the same terms twice: once OR-ed, once AND-ed.
    fuzzy_or_query_parser = qparser.QueryParser("content", index.schema,
                                                group=qparser.OrGroup)
    fuzzy_or_query_parser.add_plugin(qparser.FuzzyTermPlugin())
    fuzzy_parsed_or_query = fuzzy_or_query_parser.parse(fuzzy_query_terms)
    fuzzy_and_query_parser = qparser.QueryParser("content", index.schema,
                                                 group=qparser.AndGroup)
    fuzzy_and_query_parser.add_plugin(qparser.FuzzyTermPlugin())
    fuzzy_parsed_and_query = fuzzy_and_query_parser.parse(fuzzy_query_terms)
    # Combine both parses so documents matching all terms score higher
    # while partial matches still qualify.
    fuzzy_query = Or([fuzzy_parsed_or_query, fuzzy_parsed_and_query])
    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(fuzzy_query, limit=None)
        video_ids = [result.fields()["title"] for result in results]
    return video_ids
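# The snippet above assumes a module-level corpus_index_dir pointing at an
# index with a searched "content" field and a stored "title" field holding
# the video id. A minimal sketch of building such an index (the directory
# name and document values are assumptions, not from the original):
import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

corpus_index_dir = "corpus_index"  # hypothetical location

schema = Schema(title=ID(stored=True),  # stored: returned as the video id
                content=TEXT)           # searched by get_video_ids()

if not os.path.exists(corpus_index_dir):
    os.mkdir(corpus_index_dir)
ix = index.create_in(corpus_index_dir, schema)
writer = ix.writer()
writer.add_document(title=u"video-001", content=u"whoosh fuzzy search demo")
writer.commit()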
def n_gram_query(self, query_string):
    og = qparser.OrGroup.factory(0.8)
    parser = qparser.QueryParser(_N_GRAM_FIELD, self._schema, group=og)
    parser.remove_plugin_class(qparser.FieldsPlugin)
    parser.remove_plugin_class(qparser.WildcardPlugin)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    return parser.parse(query_string)
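# _N_GRAM_FIELD above presumably names an n-gram field in self._schema. A
# sketch of what such a field could look like using Whoosh's built-in
# NGRAMWORDS type (the field name and gram sizes are assumptions):
from whoosh.fields import Schema, ID, NGRAMWORDS

_N_GRAM_FIELD = "ngrams"  # hypothetical constant matching the snippet

# NGRAMWORDS indexes 2- to 4-character grams of each word, so short
# misspellings still overlap with the indexed terms.
schema = Schema(
    doc_id=ID(stored=True, unique=True),
    ngrams=NGRAMWORDS(minsize=2, maxsize=4),
)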
def task_frequency_in_index(self) -> list:
    """
    Determines the most frequent words for paragraphs of the class
    "Ihre Aufgaben" (job tasks).
    """
    task_search_string = \
        "alltag arbeitsgebiet~ are bringen^(-0.5) (aufgabe~ aufgabenbereich~ aufgabenbeschreibung aufgabenfeld " \
        "aufgabengebiet~ aufgabenschwerpunkt~ aufgabenspektrum)^2 bietest challenge chance dein~ dich " \
        "einsatz engagement^0.75 erwartet field (hauptaufgaben~2/12 haupttätigkeit~2/14)^2 " \
        "herausforderung ihr~ machst meine (responsibilitie~2/15)^1.5 schwerpunkt~ sie " \
        "(task~/4)^3 themengebiet~2/12 tun umfasst unterstützen (verantwortlichkeit~2/18 verantwortung~2/13)^2 " \
        "wirkungsfeld work workspace you~2/3"
    or_group = qparser.OrGroup.factory(1)
    parser = qparser.QueryParser("paragraph_heading", schema=self.schema,
                                 group=or_group)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    heading_searcher = self.ix.searcher()
    try:
        result_docs = heading_searcher.search(
            parser.parse(task_search_string), limit=None)
        return self.text_term_frequency(result_docs)
    except ZeroDivisionError:
        # "Too few documents were available for this query"
        return ["Für diese Anfrage waren zu wenig Dokumente vorhanden", 0]
    finally:
        # Close the searcher that was actually used for the query.
        heading_searcher.close()
def requirements_frequency_in_index(self) -> list:
    """
    Determines the most frequent words for paragraphs of the class
    "Ihre Qualifikationen" (requirements).
    """
    requirements_search_string = \
        "(anforderung~2/5 anforderungsprofil)^2 anwenderkenntnisse~ are ausmacht auszeichnet background " \
        "bedingungen berufserfahrung~2 bietest bist (bringen~2/5)^2 dein du (einstellungsvoraussetzungen~2/25)^3 " \
        "(erfahrungen~2/9)^1.5 erforderlich~ erwarten (erwartungen~/5)^1.25 essential~ experiences~/7 fachgebiet " \
        "fachliche~/8 fachrichtung fähigkeiten^2 hast have ich ihr~ kannst kenntnisse~/8 (kompetenz~2/9)^2 " \
        "kompetenzprofil meine mitbringen mitbringst optimal~3/7 (pluspunkt~)^1.25 profil~2/4 punkten " \
        "(qualifications~2/13 qualifikation~2/13)^2 reference required requirements^2 sich skills^3 sollten " \
        "solltest steckbrief (stellenanforderung~2/18)^2 talent ticken (voraussetzung~2/13 vorkenntniss~2/12)^2 " \
        "wonach worauf you~2/3 zusätzliche"
    or_group = qparser.OrGroup.factory(1)
    parser = qparser.QueryParser("paragraph_heading", schema=self.schema,
                                 group=or_group)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    heading_searcher = self.ix.searcher()
    try:
        result_docs = heading_searcher.search(
            parser.parse(requirements_search_string), limit=None)
        return self.text_term_frequency(result_docs)
    except ZeroDivisionError:
        # "Too few documents were available for this query"
        return ["Für diese Anfrage waren zu wenig Dokumente vorhanden", 0]
    finally:
        heading_searcher.close()
def benefits_frequency_in_index(self) -> list:
    """
    Determines the most frequent words for paragraphs of the class
    "Ihre Vorteile" (benefits).
    """
    benefits_search_string = \
        "angebot are attraktiv (bekommen bekommst benefit~ bieten)^2 dein~ dich dir^1.5 freuen " \
        "geboten^1.5 ihnen ihr~ erwarten kannst konditionen leistungen mehrwert mein mitarbeitervorteile^3 " \
        "offer our^1.5 perks^2 salary sie ticken uns unser~/4 unternehmensprofil (vorteil~/7)^2 wir worauf freuen " \
        "you~2/3 zusatzleistungen^2 zusätzliche perspective~2/6"
    or_group = qparser.OrGroup.factory(1)
    parser = qparser.QueryParser("paragraph_heading", schema=self.schema,
                                 group=or_group)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    heading_searcher = self.ix.searcher()
    try:
        result_docs = heading_searcher.search(
            parser.parse(benefits_search_string), limit=None)
        return self.text_term_frequency(result_docs)
    except ZeroDivisionError:
        # "Too few documents were available for this query"
        return ["Für diese Anfrage waren zu wenig Dokumente vorhanden", 0]
    finally:
        heading_searcher.close()
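# The three *_frequency_in_index() methods above differ only in their search
# string. A possible consolidation into one helper (the helper name is
# hypothetical; everything else is taken from the snippets):
def _heading_frequency_in_index(self, search_string: str) -> list:
    """Shared body for the task/requirements/benefits frequency queries."""
    parser = qparser.QueryParser("paragraph_heading", schema=self.schema,
                                 group=qparser.OrGroup.factory(1))
    parser.add_plugin(qparser.FuzzyTermPlugin())
    heading_searcher = self.ix.searcher()
    try:
        result_docs = heading_searcher.search(
            parser.parse(search_string), limit=None)
        return self.text_term_frequency(result_docs)
    except ZeroDivisionError:
        # "Too few documents were available for this query"
        return ["Für diese Anfrage waren zu wenig Dokumente vorhanden", 0]
    finally:
        heading_searcher.close()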
def indexquery(name, www):
    if name is None:
        return []
    # print("Name: %s" % name)
    ix = index.open_dir("/var/www/restnames/index")
    # FuzzyTerm as the termclass makes every parsed term fuzzy by default.
    qp = MultifieldParser(
        ["commonname", "database", "tags", "name", "name_part", "country",
         "project", "url"],
        schema=ix.schema, termclass=FuzzyTerm)
    qp.add_plugin(qparser.FuzzyTermPlugin())
    q = qp.parse(name)
    # q = Every()
    tempvar = []
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for hit in results:
            tempvar.append({
                'name': hit["name"],
                'commonname': hit["commonname"],
                'url': hit["url"]
            })
    if not www:
        return tempvar
    else:
        response = Response(
            render_template("searchresults.html", resultlist=tempvar))
        response.headers['content-type'] = 'text/html'
        return response
async def search(query_str, ctx):
    ix = open_dir("indexdir")
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    parser.add_plugin(GtLtPlugin())
    parser.add_plugin(DateParserPlugin())
    query = parser.parse(query_str)
    print(query)
    with ix.searcher(weighting=scoring.PL2) as searcher:
        results = searcher.search(query, limit=5)
        results.fragmenter = highlight.SentenceFragmenter()
        results.fragmenter.surround = 50
        results.fragmenter.maxchars = 10000
        results.formatter = DiscordBoldFormatter()
        embed = discord.Embed(
            title="Results",
            color=discord.Color(0x3cd63d),
            description="From search: **{}**".format(query_str))
        for hit in results:
            # embed.add_field(name="[{}]({})".format(hit["title"], hit["url"]),
            #                 value="{}".format(hit.highlights("content")))
            embed.add_field(
                name="\u200b",
                value=f"[{hit['title']}]({hit['url']})\n"
                      f"{hit.highlights('content', minscore=0)}",
                inline=False)
    await ctx.send(embed=embed)
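# The DiscordBoldFormatter used above is not shown in the snippet. A minimal
# sketch of what it might look like, built on Whoosh's custom-formatter API
# (the exact markup is an assumption):
from whoosh.highlight import Formatter, get_text

class DiscordBoldFormatter(Formatter):
    """Wraps matched terms in ** so Discord renders them as bold."""

    def format_token(self, text, token, replace=False):
        # get_text() pulls the token's source text out of the fragment.
        return "**%s**" % get_text(text, token, replace)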
def _search_query(search_string, index_dir, search_field):
    search_index = index.open_dir(index_dir)
    query_parser = qparser.QueryParser(search_field,
                                       schema=search_index.schema)
    query_parser.add_plugin(qparser.PrefixPlugin())
    query_parser.add_plugin(qparser.FuzzyTermPlugin())
    # Close the searcher automatically once the results are materialised.
    with search_index.searcher() as searcher:
        results = searcher.search(query_parser.parse(search_string), limit=20)
        return [dict(result) for result in results]
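# With both plugins registered, _search_query() accepts prefix terms ("eng*")
# and fuzzy terms ("serch~") in one query string. A hedged usage example
# (the index directory and field name are assumptions):
hits = _search_query("serch~ eng*", "search_index", "content")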
def _mk_parser(self):
    from whoosh import qparser as qparse
    # use whoosh default query parser for now
    parser = qparse.QueryParser("meta", schema=self.idx_obj.schema)
    parser.add_plugin(qparse.FuzzyTermPlugin())
    parser.remove_plugin_class(qparse.PhrasePlugin)
    parser.add_plugin(qparse.SequencePlugin())
    self.parser = parser
def search(self, string=None, fields=["title", "content"]):
    query_parser = qparser.MultifieldParser(fields, self.ix.schema,
                                            group=qparser.OrGroup)
    query_parser.remove_plugin_class(qparser.PhrasePlugin)
    query_parser.add_plugin(qparser.FuzzyTermPlugin())
    query_parser.add_plugin(qparser.SequencePlugin())
    with self.ix.searcher(weighting=scoring.BM25F) as searcher:
        pattern = query_parser.parse(u'"{}"'.format(string))
        for result in searcher.search(pattern, limit=None):
            yield result
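# The u'"{}"' quoting above relies on the plugin swap: with PhrasePlugin
# removed and SequencePlugin added, a quoted group parses as an ordered
# sequence whose members may themselves be fuzzy terms. A quick standalone
# illustration (the schema and field name are assumptions):
from whoosh import qparser
from whoosh.fields import Schema, TEXT

parser = qparser.QueryParser("content", Schema(content=TEXT))
parser.remove_plugin_class(qparser.PhrasePlugin)
parser.add_plugin(qparser.FuzzyTermPlugin())
parser.add_plugin(qparser.SequencePlugin())

# Fuzzy terms are legal inside the quoted sequence:
print(parser.parse(u'"whosh~ serch~"'))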
def _mk_parser(self):
    from whoosh import qparser as qparse
    parser = qparse.MultifieldParser(self.idx_obj.schema.names(),
                                     self.idx_obj.schema)
    # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
    # upstream
    parser.add_plugin(qparse.FuzzyTermPlugin())
    parser.add_plugin(qparse.GtLtPlugin())
    parser.add_plugin(qparse.SingleQuotePlugin())
    # replace field definition to allow for colons to be part of a field's name:
    parser.replace_plugin(
        qparse.FieldsPlugin(expr=r"(?P<text>[()<>.\w]+|[*]):"))
    self.parser = parser
def basic_search(query, query_parse, group=default_group,
                 facet=default_facet, index=default_index):
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    # Plugins must be registered before parse() for them to take effect.
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    parser.add_plugin(qparser.FuzzyTermPlugin())
    myquery = parser.parse(query)
    # limit caps the number of results (default 10); see the official
    # documentation linked at the top of this post.
    results = searcher.search(myquery, limit=None, sortedby=facet)
    print(results)
    return results
def index_search(self, search_query):
    if '/' in search_query:
        return []
    # Analyse the query, then append "~" to every token so each one is
    # parsed as a fuzzy term, e.g. "red car" -> "red~ car~".
    search_query = [token.text for token in my_analyzer(search_query)]
    search_query = '~ '.join(search_query)
    search_query += '~'
    ix = index.open_dir("index")
    with ix.searcher(weighting=scoring.Frequency) as s:
        og = qparser.OrGroup.factory(0.8)
        qp = qparser.QueryParser("name", schema=ix.schema,
                                 termclass=MyFuzzyTerm, group=og)
        qp.add_plugin(qparser.FuzzyTermPlugin())
        qp.add_plugin(qparser.SequencePlugin())
        q = qp.parse(search_query)
        results = s.search(q, terms=True, limit=None)
        ids = []
        for res in results:
            # ids.append(res['name'])
            ids.append(res['id'])
        return ids
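# MyFuzzyTerm is referenced above but not defined in the snippet. The usual
# pattern is to subclass FuzzyTerm to raise the default maximum edit
# distance, so plain terms also parse as fuzzy ones; the parameter values
# here are assumptions:
from whoosh.query import FuzzyTerm

class MyFuzzyTerm(FuzzyTerm):
    """FuzzyTerm with a more permissive default maximum edit distance."""

    def __init__(self, fieldname, text, boost=1.0, maxdist=2,
                 prefixlength=1, constantscore=True):
        super().__init__(fieldname, text, boost=boost, maxdist=maxdist,
                         prefixlength=prefixlength,
                         constantscore=constantscore)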
def question_tokens_to_query(keywords):
    """
    From a list of keywords and their synonyms, build a Whoosh query.
    """
    # Build the query string: each keyword group becomes "(kw OR syn OR ...)"
    query_str = ""
    for keyword in keywords:
        keywords_str = "("
        for i in range(len(keyword)):
            keywords_str += keyword[i] + " OR "
        keywords_str = keywords_str[:-4]  # remove the trailing " OR "
        keywords_str += ")"
        query_str += keywords_str + " "
    # From the query string, build the Whoosh query object
    ix = index.open_dir(index_dir)
    parser = qparser.MultifieldParser(["title", "content"], ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())  # for complex phrase queries
    parser.add_plugin(qparser.FuzzyTermPlugin())  # terms don't have to match exactly
    query = parser.parse(query_str)
    return query
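# For reference, the string-building loop in question_tokens_to_query()
# turns grouped synonyms into parenthesised OR groups. A tiny standalone
# demonstration (the keyword data is hypothetical); " OR ".join() is an
# equivalent, tidier way to build each group than slicing off the tail:
keywords = [["car", "automobile"], ["engine"]]

query_str = ""
for keyword in keywords:
    query_str += "(" + " OR ".join(keyword) + ") "

print(query_str)  # -> "(car OR automobile) (engine) "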
def __init__(self, index_path, language):
    from whoosh import index as whoosh_index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import qparser
    from whoosh.highlight import UppercaseFormatter
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.lang import has_stemmer, has_stopwords
    import os
    import sys  # used by the sys.exit() calls below

    if not has_stemmer(language) or not has_stopwords(language):
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)
    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()
    self.index_path = index_path
    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)
    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)
    self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                 schema=self.schema)
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
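# A possible companion lookup method for the class above (the method name
# and return shape are assumptions; query_parser, search_index, and the
# FuzzyTermPlugin registration come from __init__):
def search(self, query_string):
    # With FuzzyTermPlugin active, "recipee~2" matches "recipe" within
    # two edits on either the body or path field.
    with self.search_index.searcher() as searcher:
        results = searcher.search(self.query_parser.parse(query_string))
        return [hit["path"] for hit in results]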
def search(self, query_str, afterYear, beforeYear, withDir, fuzzycheck):
    # Open the index to search and create a QueryParser over the title field.
    if fuzzycheck == "checked":
        print("fuzzycheck is checked")
    movies = list()
    # Use "with" so the searcher is automatically closed afterwards.
    with self.indexer.searcher() as s:
        parser = qparser.QueryParser("title", schema=self.indexer.schema)
        parser.add_plugin(qparser.FuzzyTermPlugin())
        q = parser.parse(query_str)
        results = s.search(q, limit=None)
        # If the query matched nothing, fall back to every document.
        if len(results) == 0:
            results = s.documents()
        for r in results:
            # Keep only results that satisfy the advanced-search flags.
            if (int(r["year"]) > int(afterYear)
                    and int(r["year"]) < int(beforeYear)
                    and withDir in r["director"]):
                pster = "https://lascrucesfilmfest.com/wp-content/uploads/2018/01/no-poster-available-737x1024.jpg"
                if r["poster"] != "N/A":
                    pster = r["poster"]
                movies.append(Movie(pster, r["url"], r["title"], r["year"],
                                    r["director"], r["plot"]))
    return movies
def search_for_results(userquery, corrected_flag=True):
    searcher = None  # defined up front so the finally block is safe
    try:
        if os.path.exists(settings.SEARCH_INDEX_DIR):
            # Open the index directory and create a searcher object.
            index_reference = open_dir(settings.SEARCH_INDEX_DIR)
            searcher = index_reference.searcher()
            # Applying stemming on the userquery
            stem(userquery)
            # OrGroup.factory gives better relevance than the naive term
            # frequency of the words in the query.
            og = qparser.OrGroup.factory(0.9)
            # Initialize a MultifieldParser to search multiple fields.
            queryparser = qparser.MultifieldParser(
                ["tags", "foss", "title", "outline"],
                schema=index_reference.schema, group=og)
            # Remove the user's ability to specify fields to search.
            queryparser.remove_plugin_class(qparser.FieldsPlugin)
            # Remove wildcard searches, which can hurt query performance.
            queryparser.remove_plugin_class(qparser.WildcardPlugin)
            # A fuzzy term is written with a ~ plus an optional maximum
            # edit distance (e.g. jav~1).
            queryparser.add_plugin(qparser.FuzzyTermPlugin())
            # Parse the given query.
            q = queryparser.parse(userquery)
            # Correct spelling with a maximum edit distance of 3; more
            # than 3 may hurt performance.
            corrected = searcher.correct_query(q, userquery, maxdist=3)
            # If the corrected query differs from the parsed query, offer a
            # "Did you mean" suggestion; if the user accepts it, the
            # suggestion is searched, otherwise the original query is.
            corrected_string = None
            if corrected_flag:
                if corrected.query != q:
                    corrected_string = corrected.string
            results = searcher.search(q, terms=True, limit=None)
            # Print the number of videos found and their titles.
            print("%d Videos Found for %s " %
                  (results.scored_length(), userquery))
            if results.has_matched_terms() and results.scored_length() > 0:
                collection = []
                for hit in results:
                    row = TutorialResource.objects.filter(
                        tutorial_detail_id=hit['VideoId'],
                        language__name='English').first()
                    collection.append(row)
                return collection, corrected_string
            else:
                return None, corrected_string
    finally:
        # Finally, close the searcher object if one was opened.
        if searcher is not None:
            searcher.close()
    return None, None
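# A sketch of how a caller might consume the (collection, corrected_string)
# pair to implement the "Did you mean" flow described in the comments (the
# example query and the re-run policy are assumptions):
results, suggestion = search_for_results("pythn", corrected_flag=True)
if results is None and suggestion:
    # e.g. render "Did you mean '<suggestion>'?" and re-run if accepted
    results, _ = search_for_results(suggestion, corrected_flag=False)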
                               i_d.schema, group=og)
        q_e = MultifieldParser(["title", "content", "extension", "url"],
                               i_e.schema, group=og)
        q_f = MultifieldParser(["title", "content", "extension", "url"],
                               i_f.schema, group=og)
    elif operator == 4:
        # print("in oper 4")
        og = qparser.OrGroup.factory(0.9)
        q_a = MultifieldParser(["title", "content", "tags", "extension", "url"],
                               i_a.schema, group=og)
        q_a.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
        q_a.add_plugin(qparser.FuzzyTermPlugin())
        q_b = MultifieldParser(["title", "content", "extension", "url"],
                               i_b.schema, group=og)
        q_b.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
        q_b.add_plugin(qparser.FuzzyTermPlugin())
        q_c = MultifieldParser(["title", "content", "extension", "url"],
                               i_c.schema, group=og)
        q_c.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
        q_c.add_plugin(qparser.FuzzyTermPlugin())
        q_d = MultifieldParser(["title", "content", "extension", "url"],
                               i_d.schema, group=og)
        q_d.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
        q_d.add_plugin(qparser.FuzzyTermPlugin())
def __call__(query=None, dataset=None, force_reindex=False, max_nresults=20,
             show_keys=False, show_query=False):
    from whoosh import qparser as qparse

    try:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        for r in _search_from_virgin_install(dataset, query):
            yield r
        return

    # where does the bunny have the eggs?
    index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad', 'search_index')
    idx_obj = _get_search_index(index_dir, ds, force_reindex)

    if show_keys:
        definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')
        try:
            defs = jsonload(gzopen(definitions_fname))
        except Exception as e:
            lgr.warning(
                'No term definitions found alongside search index: %s',
                exc_str(e))
            defs = {}
        for k in idx_obj.schema.names():
            print('{}{}'.format(
                k,
                ' {}'.format(
                    defs[k] if isinstance(defs[k], dict)
                    else '({})'.format(defs[k]))
                if k in defs else ''))
        return

    if not query:
        return

    with idx_obj.searcher() as searcher:
        # parse the query string, default whoosh parser ATM, could be
        # tailored with plugins
        parser = qparse.MultifieldParser(idx_obj.schema.names(),
                                         idx_obj.schema)
        # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
        # upstream
        parser.add_plugin(qparse.FuzzyTermPlugin())
        parser.add_plugin(qparse.GtLtPlugin())
        # replace field definition to allow for colons to be part of a field's name:
        parser.replace_plugin(
            qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
        # for convenience we accept any number of args-words from the
        # shell and put them together to a single string here
        querystr = ' '.join(assure_list(query))
        # this gives a formal whoosh query
        wquery = parser.parse(querystr)

        if show_query:
            print(wquery)
            return
        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # cheap way to get an approximate number of hits, without an expensive
        # scoring of all items
        # disabled: unreliable estimate, often confusing
        # nhits = hits.estimated_min_length()
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults))
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr) if max_nresults > 0 else 'all matches')
            if not hits.is_empty() else ' No matches.'))

        if not hits:
            return

        nhits = 0
        for hit in hits:
            res = dict(
                action='search',
                status='ok',
                logger=lgr,
                refds=ds.path,
                # normpath to avoid trailing dot
                path=normpath(opj(ds.path, hit['path'])),
                query_matched={
                    assure_unicode(k):
                    assure_unicode(v) if isinstance(v, unicode_srctypes) else v
                    for k, v in hit.matched_terms()},
                metadata={
                    k: v for k, v in hit.fields().items()
                    if k not in ('path', 'parentds')})
            if 'parentds' in hit:
                res['parentds'] = normpath(opj(ds.path, hit['parentds']))
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info("Reached the limit of {}, there could be more which "
                     "were not reported.".format(topstr))
def query(my_query):
    schema = Schema(href=ID(stored=True),
                    title=TEXT(field_boost=2.0, stored=True),
                    page_content=TEXT(analyzer=StemmingAnalyzer(),
                                      stored=True))
    ix = index.open_dir("index_dir")
    # qp = QueryParser("page_content", schema=ix.schema)
    mparser = qparser.MultifieldParser(["title", "page_content"],
                                       schema=schema, group=qparser.OrGroup)
    # The fuzzy plugin must be added before parse() to take effect.
    mparser.add_plugin(qparser.FuzzyTermPlugin())
    my_query_new = ""
    ff_q = []
    final_my_query_new = ""
    analyzer = StandardAnalyzer()
    for t in analyzer(my_query):
        # print(t.text)
        my_query_new += " " + str(t.text)
        ff_q.append(str(t.text))
    my_stop_words = [
        "when", "http", "all", "but", "how", "so", "which", "has", "is",
        "it", "do", "than", "some", "what", "was", "class", "my", "there",
        "both", "would", "even"
    ]
    for words in ff_q:
        if words not in my_stop_words:
            final_my_query_new += " " + str(words)
    q = mparser.parse(final_my_query_new)
    final_link = []
    final_highlights = []
    with ix.searcher() as s:
        results = s.search(q, limit=10)
        results.fragmenter.surround = 50
        # print(results)
        for r in results:
            final_link.append(r['href'])
            final_highlights.append(str(r.highlights("page_content")))
    return final_link, final_highlights


# def search(request):
#     cur_dir = os.path.normpath('test_xlsx_file_here')
#
#     for sub_dir, dirs, files in os.walk(cur_dir):
#         for file in files:
#             if file.endswith(".xlsx") and not file.startswith("~"):
#                 name = os.path.join(sub_dir, file)
#                 wb = load_workbook(str(name))
#                 ws = wb.get_sheet_by_name(wb.get_sheet_names()[0])
#                 type = []
#                 text_code = []
#                 for i in range(2, ws.max_row + 1):
#                     temp = ""
#                     if ws.cell(row=i, column=2).value != None:
#                         temp = temp + str(ws.cell(row=i, column=2).value)
#                     if ws.cell(row=i, column=3).value != None:
#                         temp = temp + str(ws.cell(row=i, column=3).value)
#                     type.append(str(ws.cell(row=i, column=1).value))
#                     text_code.append(temp)
#     # create_index()
#     final_l1, final_h1 = query(text_code[0])
#     final_l2, final_h2 = query(text_code[1])
#     final_l3, final_h3 = query(text_code[2])
#     final_l4, final_h4 = query(text_code[3])
#     final_l5, final_h5 = query(text_code[4])
#     final_l6, final_h6 = query(text_code[5])
#     final_l7, final_h7 = query(text_code[6])
#     final_l8, final_h8 = query(text_code[7])
#     final_l9, final_h9 = query(text_code[8])
#     final_l10, final_h10 = query(text_code[9])
#
#     context = {
#         'type': type,
#         'text_code': text_code,
#         'final_link1': final_l1,
#         'final_link2': final_l2,
#         'final_link3': final_l3,
#         'final_link4': final_l4,
#         'final_link5': final_l5,
#         'final_link6': final_l6,
#         'final_link7': final_l7,
#         'final_link8': final_l8,
#         'final_link9': final_l9,
#         'final_link10': final_l10,
#         'final_h1': final_h1,
#         'final_h2': final_h2,
#         'final_h3': final_h3,
#         'final_h4': final_h4,
#         'final_h5': final_h5,
#         'final_h6': final_h6,
#         'final_h7': final_h7,
#         'final_h8': final_h8,
#         'final_h9': final_h9,
#         'final_h10': final_h10,
#     }
#     return render(request, 'index.html', context)