def issue_search(queries_list, return_objects=False, make_phrase=False, case_sensitive=False): # Remove quotation marks queries = [q.replace("'", "").replace('"', '') for q in queries_list] if make_phrase: queries = ["\"" + q + "\"" for q in queries] ix = get_issue_index(case_sensitive=case_sensitive) parser = QueryParser("text", schema=ix.schema) parser.add_plugin(PhrasePlugin()) with ix.searcher() as searcher: parsed_queries = [parser.parse(q) for q in queries] q = whoosh.query.Or(parsed_queries) results = searcher.search(q, limit=None) # print " -", len(results), "results" if return_objects: return [ LobbyingSpecificIssue.query.get(int(i['id'])) for i in results ] else: return [i['id'] for i in results]
def searchNote(self): """ Sorting criteria: "title > path > content" Search matches are organized into html source. """ pattern = self.searchEdit.text() if not pattern: return results = [] print("Searching using", pattern) with self.ix.searcher() as searcher: matches = [] for f in ["title", "path", "content"]: queryp = QueryParser(f, self.ix.schema) queryp.add_plugin(RegexPlugin()) # r"pattern" is the desired regex term format query = queryp.parse('r"' + pattern + '"') ms = searcher.search(query, limit=None) # default limit is 10! for m in ms: if not m in matches: matches.append(m) for r in matches: title = r['title'] path = r['path'] term = r.highlights("content") results.append([title, path, term]) html = "" for title, path, hi in results: html += ("<p><a href='" + path + "'>" + title + "</a><br/><span class='path'>" + path + "</span><br/>" + hi + "</p>") self.searchView.setHtml(html) print("Finished searching", pattern)
def get_html_correction(searcher, query_str, qp):
    exact_qp = QueryParser('exact', my_index.search_schema)
    exact_qp.add_plugin(DateParserPlugin())
    exact_qp = exact_qp.parse(query_str)
    try:
        corrected_query = searcher.correct_query(exact_qp, query_str, prefix=1)
    except Exception:
        return ""
    for token in corrected_query.tokens:
        # Possible Whoosh quirk: token.startchar/endchar sometimes do not line up
        # with token.original (e.g. startchar:8, endchar:9, original:'tes?')
        if query_str[token.startchar:token.endchar] != token.original:
            return ""
        for variations in (uk_variations, us_variations):
            if token.original in variations and searcher.ixreader.frequency(
                    'exact', variations[token.original]) > 0:
                token.text = variations[token.original]
                break
        # This branch may never run because of the possible offset issue above
        if re.search(r'\W', token.original):
            token.text = token.original
    corrected_query_str = replace_tokens(query_str, corrected_query.tokens)
    corrected_qp = QueryParser('stemmed', my_index.search_schema)
    corrected_qp.add_plugin(DateParserPlugin())
    corrected_qp = corrected_qp.parse(corrected_query_str)
    if corrected_qp == qp:
        return ""
    result = '<h3>Did you mean <a href="{}">{}</a>?</h3>'.format(
        stateful_url_for('search_form', q_query=urlize(corrected_query_str)),
        corrected_query.format_string(
            highlight.HtmlFormatter(classname="change")))
    return result
def search(self, user_query, ranking_function=scoring.BM25F(), phraseSearch=False): qp = QueryParser("body", schema=self.ix.schema) # Once you have a QueryParser object, you can call parse() on it to parse a query string into a query object: # default query lang: # If the user doesn’t explicitly specify AND or OR clauses: # by default, the parser treats the words as if they were connected by AND, # meaning all the terms must be present for a document to match # we will change this # to phrase search "<query>" - use quotes qp.add_plugin(qparser.GtLtPlugin) # qp.remove_plugin_class(qparser.PhrasePlugin) qp.add_plugin(qparser.PhrasePlugin) if phraseSearch == True: user_query = '"' + user_query + '"' query = qp.parse(user_query) print("# user_query", user_query, ", Query: ", query) print(query) with self.ix.searcher(weighting=ranking_function) as searcher: matches = searcher.search(query, limit=None) print("Total Number of Results:", len(matches)) print("Number of scored and sorted docs in this Results object:", matches.scored_length()) results = [item.fields() for item in matches] resultsDF = pandas.DataFrame.from_dict(results) return (matches.docs(), resultsDF)
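# A hedged sketch (not from the original source) of the grouping behaviour the
# comments in search() describe: the default AndGroup requires every term,
# OrGroup matches any term, and quoting the words produces a phrase query.
# The "body" field and the `ix` index are assumptions carried over from above.
from whoosh.qparser import QueryParser, OrGroup

and_parser = QueryParser("body", schema=ix.schema)                # all terms must match
or_parser = QueryParser("body", schema=ix.schema, group=OrGroup)  # any term may match
print(and_parser.parse(u"rain forecast"))    # And([Term('body', 'rain'), Term('body', 'forecast')])
print(or_parser.parse(u"rain forecast"))     # Or([Term('body', 'rain'), Term('body', 'forecast')])
print(and_parser.parse(u'"rain forecast"'))  # Phrase('body', ['rain', 'forecast'])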
def searchNote(self): pattern = self.searchEdit.text() qres = [] with self.ix.searcher() as searcher: queryp = QueryParser("content", self.ix.schema) queryp.add_plugin(RegexPlugin()) query = queryp.parse('r"' + pattern + '"') # r"pattern" is the desired regex term format pathFacet = sorting.FieldFacet("path") scores = sorting.ScoreFacet() results = searcher.search( query, limit=None, sortedby=[pathFacet, scores]) # default limit is 10! for r in results: listItem = QListWidgetItem() title = r['title'] text = r['path'] term = r.highlights("content") qres.append([title, text, term]) html = """ <style> body { font-size: 14px; } .path { font-size: 12px; color: #009933; } </style> """ for ti, te, hi in qres: html += ("<p><a href='" + te + "'>" + ti + "</a><br/><span class='path'>" + te + "</span><br/>" + hi + "</p>") self.searchView.setHtml(html)
def query(self, q): # parser qpcontent = QueryParser("fullText", schema=self.indexer.schema) qpanchor = QueryParser("anchorText", schema=self.indexer.schema) qpcontent.add_plugin(qparser.OperatorsPlugin()) qpanchor.add_plugin(qparser.OperatorsPlugin()) # query qcontent = qpcontent.parse(q) qanchor = qpanchor.parse(q) resWeb = [] with self.indexer.searcher() as s: resContent = s.search(qcontent, limit=40) resAnchor = s.search(qanchor, limit=40) resFinal = resAnchor resFinal.upgrade_and_extend(resContent) respgMap = {} resAnchorMap = {} resTextMap = {} resURLMap = {} for r in resFinal: resURLMap[r['pageURL']] = r for r in resFinal: respgMap[r['pageURL']] = self.pgrank[self.allFilesMap[ r['pageURL']]] resAnchorMap[r['pageURL']] = r['anchorText'] resTextMap[r['pageURL']] = r['fullText'] supportedRes = [] for r in respgMap: supportedRes.append((r, respgMap[r])) # consolidate the result by VSMSP algorithm ii = 0 while ii + 10 < len(supportedRes): supportedRes[ii:ii + 10] = sorted(supportedRes[ii:ii + 10], key=operator.itemgetter(1)) ii = ii + 10 supportedRes[ii:] = sorted(supportedRes[ii:], key=operator.itemgetter(1)) for r in supportedRes: hts = self.__cleanhtml(resURLMap[r[0]].highlights("anchorText", top=3)) hts = hts + self.__cleanhtml(resURLMap[r[0]].highlights( "fullText", top=2)) resWeb.append([resURLMap[r[0]]["title"], r[0], hts]) """ print(resURLMap[r[0]]["title"]) print(r[0]) print(r[1]) print(self.__cleanhtml(resURLMap[r[0]].highlights("anchorText",top=3))) print(self.__cleanhtml(resURLMap[r[0]].highlights("fullText",top=2))) print("\n\n\n") """ return resWeb
def getLastNotice():
    """ @returns the last added notice to the Index """
    ix = open_dir("index")
    with ix.searcher() as searcher:
        qp = QueryParser("content", ix.schema, group=OrGroup)
        qp.add_plugin(DateParserPlugin())
        query = qp.parse(u"date:[18000101 to today]")
        results = searcher.search(query, limit=1)
        return results.fields(0)
def searcher(index_path, query): ix = open_dir(index_path) searcher = ix.searcher() parser = QueryParser("content", ix.schema) parser.add_plugin(FuzzyTermPlugin()) my_query = parser.parse(query) results = searcher.search(my_query, limit=None) for result in results: print(result['content'])
def QuerySent(query, schema, SentNum):
    # SetIndex("TestIndex")
    # Parse against both the "title" and "sent" fields
    # (uncomment FuzzyTermPlugin to allow fuzzy terms)
    parser = QueryParser(None, schema)
    parser.add_plugin(MultifieldPlugin(["title", "sent"]))
    # parser.add_plugin(FuzzyTermPlugin())
    myquery = parser.parse(query)
    # assumes a module-level `searcher` opened on the index set above
    results = searcher.search(myquery)
    for rs in results:
        if rs["sent"] == str(SentNum):
            return rs["content"]
def findMailbyDate(indexes, input_user):
    ix = indexes.get("index_emails")
    qp = QueryParser("date", schema=ix.schema)
    qp.add_plugin(DateParserPlugin())
    date_now = datetime.now()
    date_input = datetime.strptime(input_user, "%Y%m%d")
    q = qp.parse(unicode("date:[" + str(date_input) + " to " + str(date_now) + "]"))
    with ix.searcher() as s:
        results = s.search(q)
        for result in results:
            print "Remitente: %s, Destinatarios: %s, Asunto: %s" % (result["sender_email"], result["recipient_emails"], result["subject"])
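# A hedged sketch of the date-range syntax used above: with DateParserPlugin
# registered, a bracketed "[start to end]" range on a DATETIME field parses into
# a whoosh.query.DateRange. The field name and index here are assumptions.
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

qp = QueryParser("date", schema=ix.schema)
qp.add_plugin(DateParserPlugin())
q = qp.parse(u"date:[20200101 to today]")  # note the spaces around "to"
print(q)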
def search(self,text): parser = QueryParser("data", self.ix.schema,group=qparser.OrGroup) parser.add_plugin(qparser.FuzzyTermPlugin()) parser.add_plugin(qparser.SequencePlugin()) query = parser.parse(text) output=[] with self.ix.searcher() as searcher: results = searcher.search(query,terms=True) for r in results: output.append( { 'icd': { 'name' : r['name'], 'icdcode': r['icdcode'] } } ) return output
def apartado_b(date1, date2):
    ix = open_dir("Index")
    dataFromResults = []
    with ix.searcher() as searcher:
        parser = QueryParser("fecha", ix.schema)
        parser.add_plugin(DateParserPlugin())
        # Range query against the "fecha" date field
        query = u"fecha:[" + date1 + " to " + date2 + "]"
        print(query)
        query = parser.parse(query)
        results = searcher.search(query)
        for r in results:
            dataFromResults.append([r["titulo"], r["fecha"]])
    return dataFromResults
def searchFe(busqueda): ix = open_dir("index") searcher = ix.searcher() date = "{" + busqueda + " to]" parser = QueryParser("fecha", ix.schema) parser.add_plugin(DateParserPlugin(free=True)) parser.add_plugin(GtLtPlugin()) myquery = parser.parse(date) results = searcher.search(myquery) return results
def search_index(words): xg_duanluo = [] with ix.searcher() as s: qp = QueryParser('duanluo', schema=ix.schema, group=qparser.OrGroup) qp.remove_plugin_class(qparser.WildcardPlugin) qp.add_plugin(qparser.PrefixPlugin()) for word in words: q = qp.parse(u'{}'.format(word)) results = s.search(q, limit=10) for i in results: xg_duanluo.append((i['id'], i['duanluo'])) return xg_duanluo
async def word_count(query_str, ctx):
    ix = open_dir("indexdir")
    parser = QueryParser("title", ix.schema)
    # Register the plugin before parsing, otherwise it has no effect
    parser.add_plugin(DateParserPlugin())
    query = parser.parse(query_str)
    print(query)
    with ix.searcher(weighting=scoring.BM25F) as searcher:
        results = searcher.search(query)
        embed = discord.Embed(title="Wordcount", color=discord.Color(0x3cd63d))
        for hit in results:
            embed.add_field(name="{}".format(hit["title"]),
                            value="Wordcount: **{}**".format(hit["wordcount"]),
                            inline=False)
        await ctx.send(embed=embed)
def basic_search(query, query_parse, group=default_group, facet=default_facet, index=default_index):
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    # Register the plugins before parsing, otherwise they have no effect
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    parser.add_plugin(qparser.FuzzyTermPlugin())
    myquery = parser.parse(query)
    # "limit" caps the number of results (default 10); see the official docs
    results = searcher.search(myquery, limit=None, sortedby=facet)
    print(results)
    return results
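# A hedged sketch of why plugin registration has to happen before parse(): the
# query string is interpreted at parse time, so a FuzzyTermPlugin added after
# parsing never gets to handle the "~" suffix. Field name and index are assumptions.
from whoosh import qparser
from whoosh.qparser import QueryParser

parser = QueryParser("content", ix.schema)
print(parser.parse(u"whoosh~2"))  # without the plugin, "~2" is not a fuzzy suffix
parser.add_plugin(qparser.FuzzyTermPlugin())
print(parser.parse(u"whoosh~2"))  # now parsed as a FuzzyTerm with maxdist=2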
def search_index(words):
    xg_words = []
    with ix.searcher() as s:
        qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)
        # Allow prefix/wildcard-style searches
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        for word in words:
            q = qp.parse(u'{}'.format(word))
            results = s.search(q, limit=10)
            for i in results:
                xg_words.append(i['section'])
    return xg_words
def _search(self, query='', field=None, index=None, terms=False, limit=None):
    ''' query (exact match) search '''
    index = index or self._default_index
    ix = self.get_index(index)
    fieldin = field or 'content'
    qp = QueryParser(fieldin, ix.schema)
    qp.add_plugin(ws.qparser.SingleQuotePlugin())
    query = qp.parse(query, normalize=False)
    with ix.searcher() as searcher:
        if terms is True:
            results = searcher.search(query, terms=True, limit=limit).matched_terms()
        else:
            results = list(searcher.search(query, limit=limit).items())
    return results
def ApartadoB(fecha): ix = open_dir("index") qp = QueryParser("fecha", schema=ix.schema) query = unicode("'"+fecha+" to today'") qp.add_plugin(DateParserPlugin(free=True)) q = qp.parse(query) print(q) s=ix.searcher() results = s.search(q) print(results) for n in results: print n.get("fecha") print n.get("remitente") print n.get("destinatarios") print n.get("asunto") print("*************\n") return results
def search(self, expr, limit=10000): with self._index.searcher() as searcher: query = QueryParser("raw", self._index.schema) query.add_plugin(FieldsPlugin()) query.add_plugin(RangePlugin()) query.add_plugin(GtLtPlugin()) query.add_plugin(WildcardPlugin()) query = query.parse(expr) for x in searcher.search(query, limit=limit): yield x
def match(query_str, idx, limit=40): ret_results = [] query_words = words_get(query_str) if len(query_words) == 0: return ret_results with idx.searcher() as searcher: rome_facet = sorting.FieldFacet('rome') # Strict search, with forced correction parser = QueryParser('label', idx.schema) query = parser.parse(f'{query_str}') cor = searcher.correct_query(query, query_str) results = searcher.search(cor.query, limit=20, collapse=rome_facet) # Word-joker search parser = QueryParser('label', idx.schema) query = parser.parse(f'{query_str}*') results_partial = searcher.search(query, limit=20, collapse=rome_facet) results.upgrade_and_extend(results_partial) # Fuzzy search parser = QueryParser('label', idx.schema, termclass=CustomFuzzyTerm) parser.add_plugin(FuzzyTermPlugin()) shortword = re.compile(r'\W*\b\w{1,3}\b') query_prep = shortword.sub('', query_str) query = parser.parse(query_prep) results_fuzzy = searcher.search(query, limit=limit, collapse=rome_facet) results.upgrade_and_extend(results_fuzzy) for res in results: ret_results.append({ 'id': res['rome'], 'label': res['label'], 'value': res['label'], 'occupation': res['slug'], 'source': res['source'], 'score': res.score }) return sorted(ret_results, key=lambda e: e['score'], reverse=True)
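# The CustomFuzzyTerm used as termclass above is not defined in the source; a
# plausible sketch, following the documented pattern of subclassing FuzzyTerm to
# change the default edit distance and prefix length:
from whoosh.query import FuzzyTerm

class CustomFuzzyTerm(FuzzyTerm):
    def __init__(self, fieldname, text, boost=1.0, maxdist=2, prefixlength=1, constantscore=True):
        # Allow up to two edits and anchor matching on the first character
        super(CustomFuzzyTerm, self).__init__(fieldname, text, boost, maxdist, prefixlength, constantscore)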
def search_index(words):
    xg_words = []
    with ix.searcher() as s:
        # group=qparser.OrGroup: match any of the query terms instead of
        # requiring all of them to be present
        qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)
        # The next two lines enable wildcard-style searches such as "窗前*月光"
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        for word in words:
            q = qp.parse(u'%s' % word)
            # limit: how many results to return
            results = s.search(q, limit=10)
            for i in results:
                xg_words.append(i['section'])
                # print (word, i['section'])
    return xg_words
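# A hedged sketch of the wildcard-to-prefix swap commented above: with
# WildcardPlugin removed and PrefixPlugin added, a trailing "*" is parsed as a
# cheap Prefix query rather than a general Wildcard query. Names are assumptions.
from whoosh import qparser
from whoosh.qparser import QueryParser

qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)
qp.remove_plugin_class(qparser.WildcardPlugin)
qp.add_plugin(qparser.PrefixPlugin())
print(qp.parse(u'win*'))  # Prefix('section', 'win')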
def test_custom_fields(self):
    obj = ObjectD(title=u'title', blurb='this is a blurb')
    db.session.add(obj)
    db.session.commit()
    self.assert_search_result(ObjectD, 'blurb')
    self.assert_search_result(ObjectD, '/blog/%s' % obj.id, fields=['url'])
    ### Date ###
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin
    # Instantiate a query parser and add the DateParserPlugin to it
    qp = QueryParser("date", ObjectD.pure_whoosh._index.schema)
    qp.add_plugin(DateParserPlugin())
    self.assert_search_result(ObjectD, qp.parse("created:today"))
    self.assert_search_no_result(ObjectD, 'what')  # Sanity check
def run(args): query = unicode(" ".join(args.query)) ix = open_dir("urla.index") qp = QueryParser("content", ix.schema) qp.add_plugin(DateParserPlugin()) with ix.searcher() as searcher: parsed = qp.parse(query) print parsed results = searcher.search(parsed, sortedby="when", reverse=True, limit=None) for result in results: timestamp = result["when"].strftime("%Y-%m-%d") print "%s %s" % (timestamp, result["content"].encode("utf-8"))
class Query(object): def __init__(self, index_name): self.index_name = index_name self.ix = open_dir(index_name) self.indexer = Indexer("./template/cache/") self.singleParser = QueryParser("word", schema=self.indexer.schema) self.searcher = self.ix.searcher() self.singleParser.add_plugin(EveryPlugin) def translate(self, word, level): query = self.singleParser.parse(word.lower()) query = And([query, Term("level", level)]) results = self.searcher.search(query) for result in results: self.log(word) return "%s [%s]" % ( colorize(word, color='magenta'), colorize(result["translation"], color='white', bold=True)) else: return word def query(self, sentence, level): ll = [[word_tokenize(w), ' '] for w in sentence.split()] words = list(itertools.chain(*list(itertools.chain(*ll)))) str_ = "" for word in words: str_ += self.translate(word, level) return str_ def process(self, filename, level): translated = "" with open(filename, 'r') as f: for line in f.readlines(): translated += self.query(line, level) translated += '\n' return translated def log(self, sentence): with open('logging.txt', 'a') as f: f.write(datetime.datetime.now().strftime("%y%m%d%H%M%S") + " " + sentence)
def test_fuzzy_prefix(): from whoosh import scoring schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT(spelling=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: # Match -> first w.add_document(title=u("First"), content=u("This is the first document we've added!")) # No match w.add_document( title=u("Second"), content=u("The second one is even more interesting! filst")) # Match -> first w.add_document(title=u("Third"), content=u("The world first line we've added!")) # Match -> zeroth w.add_document( title=u("Fourth"), content=u("The second one is alaways comes after zeroth!")) # Match -> fire is within 2 edits (transpose + delete) of first w.add_document(title=u("Fifth"), content=u("The fire is beautiful")) from whoosh.qparser import QueryParser, FuzzyTermPlugin parser = QueryParser("content", ix.schema) parser.add_plugin(FuzzyTermPlugin()) q = parser.parse("first~2/3 OR zeroth", debug=False) assert isinstance(q, query.Or) ft = q[0] assert isinstance(ft, query.FuzzyTerm) assert ft.maxdist == 2 assert ft.prefixlength == 3 with ix.searcher(weighting=scoring.TF_IDF()) as searcher: results = searcher.search(q) assert len(results) == 4 assert (" ".join(sorted( hit["title"] for hit in results)) == "Fifth First Fourth Third")
def search_langs(repos, q, limit=1000, **kw): index_ = get_langs_index(repos) qp = QueryParser("ini", schema=index_.schema) qp.add_plugin(GtLtPlugin()) q = '{0} {1}'.format( q, ' '.join('{0}:"{1}"'.format(k, v) for k, v in kw.items())) def highlight(res): hl = res.highlights('ini', top=1) if hl: for line in hl.split('\n'): if '[[' in line: return line.strip() with index_.searcher() as searcher: results = searcher.search(qp.parse(q), limit=limit) results.formatter = BracketFormatter() return (len(results), [ Languoid(r['id'], r.get('iso'), r['name'], r['level'], r['fname'], highlight(r)) for r in results ])
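# A hedged sketch of the GtLtPlugin syntax that search_langs() enables: ">", "<",
# ">=" and "<=" after a field name become open-ended range queries. The field
# names below are illustrative assumptions, not taken from the langs schema.
from whoosh.qparser import QueryParser, GtLtPlugin

qp = QueryParser("ini", schema=index_.schema)
qp.add_plugin(GtLtPlugin())
print(qp.parse(u"latitude:>=10 longitude:<20"))  # two range sub-queries ANDed together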
def summary_search(queries_list, return_objects=False, make_phrase=False): queries = [unicode(q).replace("\'", "").replace('\"', '') for q in queries_list] if make_phrase: queries = ["\"" + q + "\"" for q in queries] create_summary_index() ix = get_summary_index() parser = QueryParser("summary", schema=ix.schema) parser.add_plugin(PhrasePlugin()) with ix.searcher() as searcher: parsed_queries = [parser.parse(q) for q in queries] total_query = whoosh.query.Or(parsed_queries) results = searcher.search(total_query, limit=None) if return_objects: return [Bill.query.get(b['id']) for b in results] else: return [b['id'] for b in results]
def search():
    window = Toplevel()
    scrollbar = Scrollbar(window)
    scrollbar.pack(side=RIGHT, fill=Y)
    mylist3 = Listbox(window, width=100, height=20, yscrollcommand=scrollbar.set)
    scrollbar.config(command=mylist3.yview)
    iC = open_dir("indexCorreo")
    qp = QueryParser("fecha", schema=iC.schema)
    qp.add_plugin(DateParserPlugin())
    fecha = E1.get()
    date_now = datetime.now()
    date_email = datetime.strptime(fecha, "%Y%m%d")
    q = qp.parse(u"fecha:[" + str(date_email) + " to " + str(date_now) + "]")
    with iC.searcher() as s:
        results = s.search(q)
        for result in results:
            mylist3.insert(END, "Remitente: %s, Destinatarios: %s, Asunto: %s" % (result["remitente"], result["destinatarios"], result["asunto"]))
    mylist3.pack(side=LEFT, fill=BOTH)
async def search(query_str, ctx): ix = open_dir("indexdir") parser = QueryParser("content", ix.schema) parser.add_plugin(qparser.FuzzyTermPlugin()) parser.add_plugin(GtLtPlugin()) parser.add_plugin(DateParserPlugin()) query = parser.parse(query_str) print(query) with ix.searcher(weighting=scoring.PL2) as searcher: results = searcher.search(query, limit=5) results.fragmenter = highlight.SentenceFragmenter() results.fragmenter.surround = 50 results.fragmenter.maxchars = 10000 results.formatter = DiscordBoldFormatter() embed = discord.Embed( title="Results", color=discord.Color(0x3cd63d), description="From search: **{}**".format(query_str)) for hit in results: # embed.add_field(name="[{}]({})".format(hit["title"], hit["url"]), value="{}".format(hit.highlights("content"))) embed.add_field(name="\u200b", value=f"[{hit['title']}]({hit['url']})\n" f"{hit.highlights('content', minscore=0)}", inline=False) await ctx.send(embed=embed)
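# The DiscordBoldFormatter assigned above is not shown in the source; a plausible
# sketch following the documented custom-formatter pattern (format_token wraps
# each matched term, here in ** ** so Discord renders it bold):
from whoosh import highlight

class DiscordBoldFormatter(highlight.Formatter):
    def format_token(self, text, token, replace=False):
        tokentext = highlight.get_text(text, token, replace)
        return "**{}**".format(tokentext)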
def __init__(self, url, headers, rows): self.url = url self.headers = headers self.rows = rows self.schema = Schema( name=TEXT(stored=False), alternative_names=TEXT(stored=False), id=ID(stored=True) ) self.index = RamStorage().create_index(self.schema) for c in [NAME_HEADER, ALT_NAMES_HEADER, TYPE_HEADER]: assert c in self.headers, 'Required "{}" column not found in {}'.format(c, url) name_idx = self.headers.index(NAME_HEADER) alt_names_idx = self.headers.index(ALT_NAMES_HEADER) writer = self.index.writer() for idx, row in enumerate(self.rows): name = row[name_idx] alt_names = row[alt_names_idx] writer.add_document( name=str(name), alternative_names=str(alt_names), id=str(idx) ) writer.commit() parser = QueryParser("name", self.index.schema) self.exact_name_query_parser = parser parser = QueryParser("name", self.index.schema) parser.add_plugin(FuzzyTermPlugin()) self.name_query_parser = parser parser = QueryParser("alternative_names", self.index.schema) parser.add_plugin(FuzzyTermPlugin()) self.alt_names_query_parser = parser
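# A hedged usage sketch for the three parsers built in __init__ above (this
# lookup helper is hypothetical, not part of the original class): the fuzzy
# parsers only do fuzzy matching when the query uses FuzzyTermPlugin's "~"
# suffix, so it is appended to every token before parsing.
def fuzzy_name_lookup(self, name, maxdist=1, limit=5):
    fuzzy_query = " ".join("{}~{}".format(tok, maxdist) for tok in name.split())
    query = self.name_query_parser.parse(fuzzy_query)
    with self.index.searcher() as searcher:
        return [hit["id"] for hit in searcher.search(query, limit=limit)]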
def searchNote(self): """ Sorting criteria: "title > path > content" Search matches are organized into html source. """ pattern = self.searchEdit.text() if not pattern: return results = [] with self.whoosh.ix.searcher() as searcher: matches = [] for f in ["title", "path", "content"]: queryp = QueryParser(f, self.whoosh.ix.schema) queryp.add_plugin(RegexPlugin()) # r"pattern" is the desired regex term format query = queryp.parse('r"' + pattern + '"') ms = searcher.search(query, limit=None) # default limit is 10! for m in ms: if not m in matches: matches.append(m) for r in matches: title = r['title'] path = r['path'] term = r.highlights("content") results.append([title, path, term]) html = """ <style> body { font-size: 14px; } .path { font-size: 12px; color: #009933; } </style> """ for title, path, hi in results: html += ("<p><a href='" + path + "'>" + title + "</a><br/><span class='path'>" + path + "</span><br/>" + hi + "</p>") self.searchView.setHtml(html)
def search_date(entry):
    ix = index.open_dir("events")
    tk = Tk()
    scrollbar = Scrollbar(tk, orient="vertical")
    lb = Listbox(tk, width=50, height=20, yscrollcommand=scrollbar.set)
    scrollbar.config(command=lb.yview)
    scrollbar.pack(side="right", fill="y")
    lb.pack(side="left", fill="both", expand=True)
    date = str(entry)
    # Parse the date against the "fechaInicio" field defined in the schema
    qp = QueryParser('fechaInicio', ix.schema)
    qp.add_plugin(DateParserPlugin())
    t = qp.parse(u"fechaInicio:" + date)
    print(t)
    with ix.searcher() as s:
        results_t = s.search(t, limit=None)
        for r in results_t:
            lb.insert(END, "Categorias: " + r["categorias"], "Título: " + r["titulo"], "Fecha: " + r["fechaInicio"], "")
    tk.mainloop()
def searchNote(self):
    """ Sorting criteria: "title > path > content"
        Search matches are organized into html source.
    """
    pattern = self.searchEdit.text()
    if not pattern:
        return
    results = []
    print("Searching using", pattern)
    with self.ix.searcher() as searcher:
        matches = []
        queryp = QueryParser("content", self.ix.schema)
        # Allow escaped quotes when regex searching
        queryp.add_plugin(RegexPlugin(expr=r'r"(?P<text>[^"\\]*(\\.[^"\\]*)*)"'))
        # ~~r"pattern" is the desired regex term format~~ Don't autoforce regexing
        query = queryp.parse(pattern)
        ms = searcher.search(query, limit=None)  # default limit is 10!
        for m in ms:
            matches.append(m)
        for r in matches:
            title = r['title']
            path = r['path']
            term = r.highlights("content")
            results.append([title, path, term])
    html = ""
    for title, path, hi in results:
        html += ("<p><a href='" + path + "'>" + title + "</a><br/><span class='path'>" + path + "</span><br/>" + hi + "</p>")
    self.searchView.setHtml(html)
    print("Finished searching", pattern)
def run(args): ix = open_dir("urla.index") qp = QueryParser("content", ix.schema) qp.add_plugin(DateParserPlugin()) while True: try: query = unicode(raw_input("> ")) except EOFError: print sys.exit(0) with ix.searcher() as searcher: parsed = qp.parse(query) print parsed results = searcher.search(parsed) for result in results: timestamp = result["when"].strftime("%Y-%m-%d") print "%s %s" % (timestamp, result["content"].encode("utf-8"))
def search_index(words):
    xg_part = []
    with ix.searcher() as s:
        # group=qparser.OrGroup: match any of the query terms instead of
        # requiring all of them to be present
        qp = QueryParser('part', schema=ix.schema, group=qparser.OrGroup)
        # The next two lines enable wildcard-style searches such as "窗前*月光"
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        # random number of results per word
        num = random.randint(3, 7)
        for word in words:
            q = qp.parse(u'%s' % word)
            # limit: how many results to return
            results = s.search(q, limit=num)
            count = 0
            for i in results:
                if count > 0:  # skip the first hit, which is the item itself
                    xg_part.append((i['pid'], i['part']))
                count += 1
    return xg_part
def buscar_apartadob(self, query): if not self.indice: tkMessageBox.showerror('Error', 'No existe ningún índice.\nPor favor, cree un índice y reintente la búsqueda.') else: result = [] indice = self.indice searcher = indice.searcher() qp = QueryParser('fecha', schema=self.schema) qp.add_plugin(FieldsPlugin()) qp.add_plugin(RangePlugin()) qp.add_plugin(GtLtPlugin()) q = qp.parse(unicode(query)) with searcher as s: busqueda = s.search(q) result = [[correo['remitente'], correo['destinatarios'], correo['asunto']] for correo in busqueda] return result
def buscar_indice(self, campo, query): if not self.indice: tkMessageBox.showerror('Error', 'No existe ningún índice.\nPor favor, cree un índice y reintente la búsqueda.') else: result = [] indice = self.indice searcher = indice.searcher() qp = QueryParser(campo, schema=self.schema) qp.add_plugin(FieldsPlugin()) qp.add_plugin(RangePlugin()) qp.add_plugin(GtLtPlugin()) q = qp.parse(unicode(query)) with searcher as s: busqueda = s.search(q) result = [correo['numero'] for correo in busqueda] return result
# You can specify None for the schema to create a parser that does not analyze the text of the query, usually for testing purposes.
parser = QueryParser("content", ix.schema)  # ix.schema and schema are the same thing here

print(len(parser.plugins), parser.plugins)  # 11
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>,
#  <whoosh.qparser.plugins.FieldsPlugin>, <whoosh.qparser.plugins.WildcardPlugin>, <whoosh.qparser.plugins.PhrasePlugin>,
#  <whoosh.qparser.plugins.RangePlugin>, <whoosh.qparser.plugins.GroupPlugin>, <whoosh.qparser.plugins.OperatorsPlugin>,
#  <whoosh.qparser.plugins.BoostPlugin>, <whoosh.qparser.plugins.EveryPlugin>]

## default_set(): Returns the default list of plugins to use.
print(len(parser.default_set()), parser.default_set())  # 10
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>, <whoosh.qparser.plugins.FieldsPlugin>,
#  <whoosh.qparser.plugins.WildcardPlugin>, <whoosh.qparser.plugins.PhrasePlugin>, <whoosh.qparser.plugins.RangePlugin>,
#  <whoosh.qparser.plugins.GroupPlugin>, <whoosh.qparser.plugins.OperatorsPlugin>, <whoosh.qparser.plugins.BoostPlugin>,
#  <whoosh.qparser.plugins.EveryPlugin>]

parser.remove_plugin_class(whoosh.qparser.plugins.WildcardPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 10 10
parser.add_plugin(qparser.PrefixPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 11 10

## parse(text, normalize=True, debug=False): Parses the input string and returns a :class:`whoosh.query.Query` object/tree.
query = parser.parse('document')

## search(q, **kwargs): Runs a :class:`whoosh.query.Query` object on this searcher and returns a :class:`Results` object.
## See :doc:`/searching` for more information.
results = searcher.search(query)  # search the "content" field for "document"
print(results)  # <Top 1 Results for Term('content', 'document') runtime=0.0015511049998622184>
print(type(results))  # <class 'whoosh.searching.Results'>

## Query method two: the two lines above are one way of doing it; the single call below works as well.
## find(defaultfield, querystring, **kwargs)
results = searcher.find("title", "document")  # find documents whose title contains 'document'
print(results)  # <Top 2 Results for Term('title', 'document') runtime=0.0008875329999682435>
print(type(results))  # <class 'whoosh.searching.Results'>; same kind of result as the first method above
import datetime import simplejson as json from flask import Flask, request, render_template, Response from whoosh.index import open_dir from whoosh.qparser import QueryParser from whoosh.qparser.dateparse import DateParserPlugin from search_index import TweetSchema app = Flask(__name__) search_index = open_dir("index") parser = QueryParser("text", TweetSchema()) parser.add_plugin(DateParserPlugin()) class APIEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() return json.JSONEncoder.default(self, obj) def jsonify(**data): return Response(json.dumps(data, cls=APIEncoder), mimetype='application/json') @app.route("/") def index(): return render_template("index.html") @app.route("/search")
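# The module above ends at the dangling "/search" route decorator; a minimal
# handler sketch to complete it, assuming the query arrives as ?q=... and that
# each hit stores "text" and "created" fields (assumptions, not confirmed by the source):
def search():
    query = parser.parse(request.args.get("q", ""))
    with search_index.searcher() as searcher:
        hits = searcher.search(query, limit=50)
        tweets = [{"text": hit["text"], "created": hit["created"]} for hit in hits]
    return jsonify(results=tweets)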
def main(): ix = open_dir("index") searcher = ix.searcher() docid=1 if len(sys.argv)>=2: input=sys.argv[1] else: print 'You need to type a file name\nType python search.py filename' exit() queries=[] with open(input, 'r') as f: for line in f: queries.append(line) qp = QueryParser("desc", schema=ix.schema) qp.add_plugin(qparser.WildcardPlugin()) qp.add_plugin(qparser.PrefixPlugin()) qp.add_plugin(qparser.RegexPlugin()) f= open('testresult.txt', 'w') for query in queries: myquery = qp.parse(query) searcher = ix.searcher() results = searcher.search(myquery, limit=400) print "\nYou are searching:" all_terms=list(myquery.iter_all_terms()) print query.strip() print 'number of hits' print (len(results)) if (len(results))==0: continue whooshresults =[int(x['index']) for x in results] # hits myresults=[x['index'] for x in results] vectorizer = TfidfVectorizer(encoding="latin-1", stop_words='english') corpus=[x['desc'] for x in results] dt=[x['dt'] for x in results] latlong=[x['latlong'] for x in results] #pageranding part X = vectorizer.fit_transform(corpus) X= X.toarray() similarity=cosine_similarity(X,X) G = nx.Graph() for i in range(len(myresults)): G.add_node(myresults[i]) for i in range(len(similarity)): for j in range(i+1,len(similarity[i])): delta=abs(dt[i]-dt[j]) loci=latlong[i][1:-1].split(',') locj=latlong[j][1:-1].split(',') loci=(loci[0],loci[1]) locj=(locj[0],locj[1]) #if similarity[i][j]>=0.1: # G.add_edge(myresults[i],myresults[j], weight=similarity[i][j]) if similarity[i][j]>=0.18 and delta<datetime.timedelta(days=7) and vincenty(loci, locj).miles<100: G.add_edge(myresults[i],myresults[j], weight=(similarity[i][j])) nxresult=sorted(nx.pagerank(G, alpha=0.85).items(), key=lambda x:-x[1]) #query expansion keywords = [keyword for keyword, score in results.key_terms("desc", docs=30, numterms=(len(all_terms)+1))] #get expanded query keywords=[('desc', x) for x in keywords] newterms=[x for x in keywords if x not in all_terms] newterms=newterms+all_terms newterms=[x[0]+':'+x[1] for x in newterms] newterms=' '.join(newterms) #expanded query print 'Do you want search?' print newterms newquery = qp.parse(newterms) print newquery searcher = ix.searcher() expansionresults = searcher.search(newquery, limit=1000) nxdesc=[] for j in nxresult[0:10]: doc=searcher.document(index=j[0]) nxdesc.append(j[0]+'\t'+doc['desc']) printpagerank(nxresult) printtfidf(whooshresults) printquery_expansion(expansionresults, newquery) visualize(G, nxresult) #writing output f.write('\n\n'+query) f.write("\n\nTop 10 query expansion results baseline result\n") f.write(('\n'.join([x['index']+'\t'+x['desc'] for x in results][0:10])).encode('utf8')) f.write("\n\nTop 10 ranking by nx pageranking\n") f.write(('\n'.join(nxdesc)).encode('utf8')) f.write("\n\nTop 10 query expansion results\n") f.write(('\n'.join([x['index']+'\t'+x['desc'] for x in expansionresults][0:10])).encode('utf8')) f.close()
abstract = TEXT authors = TEXT(stored=True) year = NUMERIC(stored=True) month = NUMERIC(stored=True) day = NUMERIC(stored=True) review = BOOLEAN(stored=True) journal = STORED volume = STORED pages = STORED ix = index.create_in(ABSTRACT_INDEX_PATH, Schema) # query parser and searcher parser = QueryParser('abstract',ix.schema) parser.add_plugin(PhrasePlugin) searcher = ix.searcher(weighting=BM25F) # facet object for sorting abstracts by date (some have years but not dates) datefacet = MultiFacet() datefacet.add_field('year') datefacet.add_field('month') datefacet.add_field('day') #Builds Query def buildquery(keywords=None): # get keyword branch of query print "keywords (buildquery input) ==", keywords keywords = keywords.decode("utf-8")
def search(self, query): #self.message(u"Searching for: \"{}\".".format(query)) parser = QueryParser(self.default_field, self.schema, group=OrGroup) parser.add_plugin(FuzzyTermPlugin()) parsed_query = parser.parse(query) return self.searcher.search(parsed_query)
def update_metabolites(db): """ Find metabolites mentioned in new articles, and insert new records into the metabolite_abstract table in the database. (For each metabolite in the metabolite_info.txt file, search against the temporary whoosh index containing only new articles.) """ logger.debug('Scanning for metabolites') # Don't open the index until this enclosing function is called, because # we'll be deleting it and re-creating it in a previous state of the # update process. ix = open_index(TEMP_METABOLITE_INDEX_PATH) cursor = getcursor(db) # query parser and searcher parser = QueryParser('abstract',ix.schema) parser.add_plugin(PhrasePlugin) searcher = ix.searcher(weighting=BM25F) #Get all common names so they don't repeat #outfile = open('metabolite2pubmed.txt','w') #mapping file common_name_set = set() with open('metabolite_info.txt')as f: for line in f: if line.startswith('HMDB'): synonym_line=f.next().strip() synonyms = synonym_line.split('\t') common_name = synonyms[0] #print(common_name) common_name_set.add(common_name) #search abstracts and write to metabolite2pubmed.txt with open('metabolite_info.txt') as f: for line in f: if line.startswith('HMDB'): #outfile.write(line) #Write ID to file (line 1) hmdb_id = line.strip() synonym_line = f.next().strip() #outfile.write(synonym_line) synonyms = synonym_line.split('\t') common_name = synonyms[0] printsyn = common_name + '\t' for s in synonyms: if s in common_name_set and s != common_name: synonyms.remove(s) continue if s == common_name: continue printsyn = printsyn + '\t' +s #outfile.write(printsyn+'\n') #Write synonyms to file (line 2) reference_line = f.next().strip() references = set(reference_line.split('\t')) if '\n' in references: references.remove('\n') for name in synonyms: query = '"' + name + '"' #performs complete query results = get_abstracts(parser, searcher, query) #searches with get_abstracts useing "line" as the search keyword for item in results: references.add(str(item)) rlist = list(references) insert_db_records(cursor, hmdb_id, rlist) #rline = '\t'.join(references) + '\n' #outfile.write(rline) #Write references to file (line 3) logger.info('updated metabolite-abstract links')
def main(query: ("Query", 'option', 'q'), arg_sentence=None, ): # test_data = SENTENCES # test_data = get_test_data(config.TEST_DATA_CSV) if arg_sentence: test_data = [(arg_sentence, [])] else: test_data = [ # ("Do you have something like the 2005 Zinfandel of Turley?".lower(), []), ("redd wine nappa chateau latoor", []), ("nappa valley", ['napa valley']), ("latour", ['chateau latour']), ("red chateu latour", ['red', 'chateau latour']), ("red", ['red']), ("red chateau lator", ['red', 'chateau latour']), ("cabernet sauvignon", ['cabernet sauvignon']), ("caubernet sauvignon", ['cabernet sauvignon']), ("cabernet savignon", ['cabernet sauvignon']), ("caubernet sauvignon", ['cabernet sauvignon']), ("how are yoou", []), ("chateu meru lator", ['merus', 'chateau latour']), ("chateau lator", ['chateau latour']), ("blak opul", ['black opal']), ("red caubernet sauvignon", ['red', 'cabernet sauvignon']) ] print() print() success = 0 total = len(test_data) if query: with magia_search._searcher(weighting=scoring.TF_IDF()) as s: qp = QueryParser(TEXT_FIELD, schema=magia_search._schema) qp.add_plugin(FuzzyTermPlugin) q = qp.parse(query) magia_search.get_search_results(ix, s, q) sys.exit() failed = [] for chunk, expected in test_data: orig_chunk = chunk print("Input chunk: {}".format(chunk)) start_time = datetime.now() result = lookup_attributes(remove_stopwords(chunk)) if sorted(result) == sorted(expected): success += 1 cprint('Success', foreground="green", background="black") else: cprint('Fail', foreground="red", background="black") failed.append((chunk, result, expected)) print('Completed in {}'.format(datetime.now() - start_time)) print('Expected', expected) print('Got:', result) print('--------------') print() print("{}/{} tests passed. {}%".format(success, total, success * 100 // total)) if failed: print() cprint('Failed', foreground="red", background="black") for chunk, result, expected in failed: print('*IN: {} *OUT: {} *EXPECTED: {}'.format(chunk, result, expected))
def MultiFieldWordNetParser(fieldnames, schema, fieldboosts=None, expansion=1, **kwargs): p = QueryParser(None, schema, **kwargs) mfp = WordnetPlugin(fieldnames, fieldboosts=fieldboosts, expansion=expansion) p.add_plugin(mfp) return p
for i in range(0,len(continents)): writer.add_document(city_name=capitalName[i],country_name=countryName[i],continent=continents[i],city_text=capitalText[i],country_text=countryText[i]) writer.commit() ###2 ##find cities with ix.searcher() as searcher: parser = QueryParser("city_text",ix.schema) #greek & roman -persian myquery = parser.parse('Greek AND Roman NOT Persian') results = searcher.search(myquery,limit=None) for result in results: print(result['city_name']) #shakespeare incl mispelled parser.add_plugin(FuzzyTermPlugin()) myquery = parser.parse(u'Shakespeare~3') results = searcher.search(myquery,limit=None) for result in results: print(result['city_name']) #located below sea level # parser.remove_plugin_class(PhrasePlugin) # parser.add_plugin(SequencePlugin()) # myquery = parser.parse("located below sea level~10") # myquery = SpanNear.phrase("city_text",["located","below","sea","level"],slop=10) myquery = Phrase("city_text",list([unicode("located"),unicode("below"),unicode("sea"),unicode("level")]),slop=10) results = searcher.search(myquery,limit=None) for result in results: print(result['city_name']) ###3
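# A hedged sketch of the commented-out SequencePlugin route in the snippet above:
# removing PhrasePlugin and adding SequencePlugin lets the proximity form
# "located below sea level"~10 be written directly in the query string instead
# of building a whoosh.query.Phrase by hand.
from whoosh.qparser import QueryParser, PhrasePlugin, SequencePlugin

parser = QueryParser("city_text", ix.schema)
parser.remove_plugin_class(PhrasePlugin)
parser.add_plugin(SequencePlugin())
myquery = parser.parse(u'"located below sea level"~10')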
from whoosh.index import create_in from whoosh.fields import * schema = Schema(title=TEXT(stored=True), content=TEXT) ix = create_in("indexdir", schema) writer = ix.writer() writer.add_document(title=u"First document", content=u"This is the first document we've added!") writer.add_document(title=u"Second document", content=u"The second one is even more interesting!") writer.add_document(title=u"Third document", content=u"letter first, stamp second, mail third") writer.add_document(title=u"Fourth document", content=u"stamp first, mail third") writer.add_document(title=u"Fivth document", content=u"letter first, mail third") writer.add_document(title=u"Sixth document", content=u"letters first, stamps second, mail third") writer.add_document(title=u"Seventh document", content=u"stamp first, letters second, mial third") writer.commit() from whoosh.qparser import QueryParser, FuzzyTermPlugin, PhrasePlugin, SequencePlugin with ix.searcher() as searcher: parser = QueryParser(u"content", ix.schema) parser.add_plugin(FuzzyTermPlugin()) parser.remove_plugin_class(PhrasePlugin) parser.add_plugin(SequencePlugin()) query = parser.parse(u"Apple iphone 6") print query results = searcher.search(query) print "nb of results =", len(results) for r in results: print r
def predict_TF_IDF(data, docs_per_q): # index docs exclude = set(string.punctuation) res = [] for idx, row in data.iterrows(): print row["id"] # get answers words w_A = set(utils.tokenize(row["answerA"])) w_B = set(utils.tokenize(row["answerB"])) w_C = set(utils.tokenize(row["answerC"])) w_D = set(utils.tokenize(row["answerD"])) sc_A = 0 sc_B = 0 sc_C = 0 sc_D = 0 q_punc = row["question"] # first thing to debug if not working question = "".join(ch for ch in q_punc if ch not in exclude) qp = QueryParser("content", schema=schema, group=qparser.OrGroup) qp.add_plugin(qparser.FuzzyTermPlugin()) qp.remove_plugin_class(qparser.PhrasePlugin) qp.add_plugin(qparser.SequencePlugin()) q = qp.parse(unicode(question, "utf-8")) # q = qp.parse('physics') # cp = qparser.CompoundsPlugin( AndMaybe="&~") with ix.searcher() as s, ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf: results = s.search(q, limit=docs_per_q) """ u_id = unicode(uuid.uuid1()) if not os.path.exists("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id): os.mkdir("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id) q_ix = index.create_in("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id, schema) q_writer = q_ix.writer() for document in results: q_writer.add_document(article_title=document['article_title'], content=document['content']) q_writer.commit() """ # with q_ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf for document in results: doc_parser = QueryParser("content", schema=schema) doc_q = doc_parser.parse(u"article_title:%s" % document["article_title"]) for w in w_A: try: sc_A += ( scoring.TF_IDF() .scorer(scoring_searcher_tfidf, "content", w) .score(doc_q.matcher(scoring_searcher_tfidf)) ) except TermNotFound: pass for w in w_B: try: sc_B += ( scoring.TF_IDF() .scorer(scoring_searcher_tfidf, "content", w) .score(doc_q.matcher(scoring_searcher_tfidf)) ) except TermNotFound: pass for w in w_C: try: sc_C += ( scoring.TF_IDF() .scorer(scoring_searcher_tfidf, "content", w) .score(doc_q.matcher(scoring_searcher_tfidf)) ) except TermNotFound: pass for w in w_D: try: sc_D += ( scoring.TF_IDF() .scorer(scoring_searcher_tfidf, "content", w) .score(doc_q.matcher(scoring_searcher_tfidf)) ) except TermNotFound: pass res.append(["A", "B", "C", "D"][np.argmax([sc_A, sc_B, sc_C, sc_D])]) return res
class Query(object): def __init__(self): self.ix = index.open_dir(config.index_file_path) self.ix2 = index.open_dir(config.index2_file_path) # Instatiate a query parser self.qp = QueryParser("content", self.ix.schema) # Add the DateParserPlugin to the parser self.qp.add_plugin(DateParserPlugin()) self.qp.add_plugin(WildcardPlugin()) self.qp.add_plugin(PrefixPlugin()) self.qp.add_plugin(RegexPlugin()) ## 将search返回的结果解析成json格式,用于前端展示 def _results_todata(self, results): data = {} if isinstance(results, Results): data["total"] = results.estimated_length() elif isinstance(results, ResultsPage): data['total'] = results.total result_list = [] for result in results: item = {} for key in result.keys(): item[key] = result.get(key) import re match_class = re.compile('class="match term[0-9]"') item['description'] = match_class.sub(" ", str(result.highlights('content'))) \ .replace(" ", "").replace("\r\n", "").replace("\n", "") item['description'] = self.truncate_description( item['description']) item['docnum'] = result.docnum result_list.append(item) data["results"] = result_list return data def _results_tohotdata(self, results): from datetime import datetime, timedelta now = datetime.now() daySeconds = 86400 weekSeconds = daySeconds * 7 monthSecond = weekSeconds * 30 data = {} if isinstance(results, Results): data["total"] = results.estimated_length() elif isinstance(results, ResultsPage): data['total'] = results.total result_list = [] i = 0 for result in results: i = i + 1 item = {} for key in result.keys(): item[key] = result.get(key) timespan = (now - item['publish_time']).seconds if timespan > daySeconds: if timespan < weekSeconds: item['hotScore'] = result.score * 1 else: item['hotScore'] = result.score * 0.5 else: item['hotScore'] = result.score * 1.5 import re match_class = re.compile('class="match term[0-9]"') item['description'] = match_class.sub(" ", str(result.highlights('content'))) \ .replace(" ", "").replace("\r\n", "").replace("\n", "") item['description'] = self.truncate_description( item['description']) item['docnum'] = result.docnum result_list.append(item) if i == 100: result_list = sorted(result_list, key=lambda results: results['hotScore']) if i < 100: result_list = sorted(result_list, key=lambda results: results['hotScore']) data["results"] = result_list return data ## 搜索功能,每次搜索一页 def query_page(self, term, page_num, page_len, sort_type): with self.ix.searcher() as searcher: if sort_type == 1: # default sorted results = searcher.search_page(self.qp.parse( term), pagenum=page_num, pagelen=page_len,sortedby=ScoreFacet()) #results2 = searcher.search_page(self.qp.parse( # term), pagenum=page_num, pagelen=page_len, sortedby=ScoreAndTimeFacet()) #self.generate_similarQuery(results,term) if sort_type == 2: # sorted by custom hot value publish_time = FieldFacet("publish_time", reverse=True) results = searcher.search_page(self.qp.parse( term), pagenum=page_num, pagelen=page_len, sortedby=publish_time) if sort_type == 3: # sorted by time publish_time = FieldFacet("publish_time", reverse=True) results = searcher.search_page(self.qp.parse( term), pagenum=page_num, pagelen=page_len, sortedby=ScoreAndTimeFacet()) return self._results_todata(results), results.results.runtime ## 截断正文内容,避免过长 def truncate_description(self, description): """ Truncate description to fit in result format. 
""" if len(description) <= 160: return description cut_desc = description[:160] i = 160 letter = description[i] length = len(description) while i < length - 1 and not (letter == ',' or letter == ',' or letter == '.' or letter == '。'): cut_desc += letter i = i + 1 letter = description[i] cut_desc += letter # print(cut_desc) return cut_desc # 计算句子的TF-IDF def cal_TF_IDF(self,words,countKey): with self.ix.searcher(weighting=scoring.TF_IDF()) as searcher_tfidf: #words = list(jieba.cut(sentence)) count = 0 score = 0 for word in words: #if word == u'的' or word == u'地' or word == u'和': # continue count += 1 try: tf = searcher_tfidf.term_info('content', word).max_weight() except: tf = 0.1 score += searcher_tfidf.idf('content',word) * tf if count == 0: return 0 else: return countKey * score / count def sentenceFind(self,sentence,terms): for term in terms: if sentence.find(term) != -1 : return 1 return 0 # 在20个句子中 选取5个包含关键词的较高TF-IDF句子 def generate_similarQuery(self, results, query_str): import re word_count = 0 # 句子数量 keywords = [] items = [] similarQuery = [] terms = [] keywords = re.split(" ", query_str) for keyword in keywords: temps = list(jieba.cut(keyword)) for temp in temps: if len(temp) == 0 : continue terms.append(temp) for result in results[0:9]: content_count = 0 content = result.get('content') content = content.replace(" ", ",") sentences = re.split(r"[,|.|,|。|!|!|?|?|:|:|;|;|……|、]", content) for sentence in sentences: item = {} countKey = 0 #for keyword in keywords: for term in terms: #terms = jieba.cut(keyword) #self.sentenceFind(sentence,terms) #if sentence.find(keyword) != -1: #if self.sentenceFind(sentence,keyword) != 0: if sentence.find(term) != -1: countKey += 1 continue if countKey == 0: continue pattern = re.compile(r'[^\u4e00-\u9fa5]') sentence_cn = re.sub(pattern, '', sentence) words = list(jieba.cut(sentence_cn)) if len(words) > 8 or len(words) < 3: continue #score = self.cal_TF_IDF(re.sub(pattern, '', words)) score = self.cal_TF_IDF(words,countKey) item['sentence'] = sentence item['score'] = score items.append(item) word_count += 1 content_count += 1 if content_count > 2: # 文章中有5个以上句子 break if word_count >= 30: # 只挑选20个句子 break if word_count >= 30: break #items = list(set(items)) items.sort(key=lambda temp : temp['score'],reverse=True) count = 0 #SentenceFilter = "" last_score = 0 for item in items: if count >= 5: continue if last_score == item["score"]: continue similarQuery.append(item["sentence"]) count += 1 last_score = item['score'] return similarQuery ## 根据关键词生成snippet def generate_snippet_from_keyword(self, content, keywords): content = content.replace(" ", "") import re sentences = re.split(r"[,|.|,|。|!|!|?|?]", content) snippet = "" count = 0 for sentence in sentences: for keyword in keywords: if sentence.find(keyword) > 0: # print(keyword, sentence) snippet = snippet + "," + sentence keywords.remove(keyword) break if len(keywords) == 0: return snippet[1:] + "。" return snippet[1:] + "。" def get_hot_words(self): import re keywords = [] searchitem = '' word = '' reader = self.ix2.reader() sentences = list(reader.field_terms('content')) for sentence in sentences: words = re.split(r"0xffff",sentence) for word in words: searchitem = searchitem + word + ' ' searchitem = searchitem.strip() keywords.append(searchitem) searchitem = '' return keywords ## 根据关键词生成推荐新闻,并生成摘要 def recommend_news(self): data = {} total = 0 result_list = [] keywords = self.get_hot_words() data["results"] = result_list with self.ix.searcher() as searcher: for keyword in keywords: results = 
searcher.search(self.qp.parse(keyword), limit=1) # keywords = [keyword for keyword, score # in results.key_terms("content", docs=10, numterms=5)] # print(keywords) item = {} for result in results: total = total + 1 for key in result.keys(): item[key] = result.get(key) item["keywords"] = [keyword[0] for keyword in searcher.key_terms([result.docnum], "content")] item["snippet"] = self.generate_snippet_from_keyword(item['content'], item['keywords']) print(item['snippet']) result_list.append(item) break data['total'] = total data['results'] = result_list return data def get_recommend_query(self, term): recom_query = [] with self.ix.searcher() as searcher: results = searcher.search_page( self.qp.parse(term), pagenum=1, pagelen=10) recommends = self.generate_similarQuery(results, term) for recommend in recommends: item = {} item['term'] = recommend recom_query.append(item) # for result in results: # #self.generate_similarQuery(results, term) # item = {} # item['term'] = result['title'] # recom_query.append(item) return recom_query def search_more_like_this(self, url, fieldname, top): with self.ix.searcher() as searcher: docnum = searcher.document_number(url=url) results = searcher.more_like(docnum, fieldname, text=None, top=top, numterms=5, model=Bo1Model, normalize=True, filter=None) return self._results_todata(results)
def search(terms, limit=50, time_slice=None): big_tables = {} for i in cats: big_tables[i]=[] f = open("./search_results.html", "w+") master_str = "<!DOCTYPE html><html><style>hr {border: 4;width: 80%;}</style><title>Search Results [term(s): "+terms+"]</title><body><br>" ix = index.open_dir("cl_index", indexname="CL") w = ix.writer() qp = QueryParser("content", schema=w.schema) qp.add_plugin(DateParserPlugin()) qp.add_plugin(GtLtPlugin()) q = qp.parse(terms) with w.searcher() as s: results = s.search(q, limit=limit) if time_slice != None: within = [] start = int("".join(time_slice[0].split(":"))) end = int("".join(time_slice[1].split(":"))) if (0<=start<=2400) and (0 <=end<=2400): for res in results: time = res["posted"] if time.minute < 10: t = int(str(time.hour)+"0"+ str(time.minute)) else: t = int(str(time.hour)+ str(time.minute)) if start < end and start <= t <=end: within.append(res) elif end < start and (start <= t or t <= end): within.append(res) else: pass results = within else: print "Invalid time slice, no results returned." results = [] print "%d search results" % len(results) print "--"*15 for res in results: to_nums(res["liwc"], big_tables) master_str += to_html(res, True) master_str += "</body></html>" f.write(master_str) f.close() res_str = "<!DOCTYPE html><html><title>LIWC statistics for term(s): "+terms+"</title><body><br>" res_str += "<table><tr>"+("<th>Category </th><th>Average</th><th>Std Dev</th><th>Max </th><th>Min </th>"*3)+"</tr>" count = 0 for_later = {} for j in big_tables.keys(): vals = big_tables[j] outputs = [] if len(vals) != 0: avg = sum(vals)/len(vals) outputs.append(round(avg,4)) var = [(i-avg)**2 for i in vals] std = math.sqrt(sum(var)/len(var)) outputs.append(round(std,4)) outputs.append(round(max(vals),4)) outputs.append(round(min(vals),4)) else: outputs = ["NA","NA","NA","NA"] if count%3 == 0: res_str+= "<tr>" res_str += "<td>"+str(j)+"</td><td>"+str(outputs[0])+"</td><td>"+str(outputs[1])+"</td><td>"+str(outputs[2])+"</td><td>"+str(outputs[3])+"</td>" count +=1 if count%3 == 0: res_str+= "</tr>" for_later[j] = outputs res_str+="</table>" if big_tables["WC"] == []: big_tables = "" res_str = "<!DOCTYPE html><html><title>LIWC statistics for term(s): "+terms+"</title><body><br>" res_str += "<p>No matches found </p></body></html>" t = open("./search_averages.html", "w+") t.write(res_str) t.close() return res_str, for_later, master_str