def img_search(command):
    """Search the image index for `command` and return highlighted results.

    Returns a list of [highlighted_title, imgurl, url, price] entries,
    one per matching document (up to 30).
    """
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "imgtitle")
    # AND together one sub-query per parsed field.
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = envir.img_searcher.search(querys, 30).scoreDocs

    # BUG FIX: the original built the highlight query with the leaked loop
    # variable `k`, i.e. whichever field the dict happened to yield last.
    # The fragment is always taken from the "imgtitle" field, so parse
    # against that field explicitly.
    query_highlight = QueryParser(Version.LUCENE_CURRENT, "imgtitle",
                                  envir.analyzer).parse(command_dict["imgtitle"])
    myhighlighter = Highlighter(SimpleHTMLFormatter(),
                                QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))

    res = []
    for scoreDoc in scoreDocs:
        # Extract the text around the matched keyword for display.
        doc = envir.img_searcher.doc(scoreDoc.doc)
        text = doc.get("imgtitle")
        key_text = myhighlighter.getBestFragment(envir.analyzer,
                                                 "imgtitle", text)
        # BUG FIX: getBestFragment returns None when no fragment matches;
        # the original `"".join(None)` raised TypeError. Fall back to the
        # raw title.
        if key_text is None:
            key_text = text or ""
        # Strip all whitespace introduced by the analyzer/tokenization.
        key_text = re.sub(r'\s', '', key_text)
        res.append([key_text, doc.get('imgurl'), doc.get("url"),
                    doc.get('price')])
    return res
def output(self, score_docs, command):
    '''
    Highlight and return the search results.

    Input:
        `score_docs`: search results from the index
        `command`: the raw query string to highlight against
    Output:
        list of documents info found in the index; each entry includes
        `title`, `url` and `abstract` (highlighted, whitespace removed)
    '''
    query = QueryParser('contents', self.analyzer).parse(command)
    highlighter = Highlighter(self.formatter, QueryScorer(query))
    highlighter.setTextFragmenter(
        SimpleFragmenter(200))  # Limit the max number of characters
    results = []
    for score_doc in score_docs:
        doc = self.searcher.doc(score_doc.doc)
        contents = doc.get('contents')
        stream = self.analyzer.tokenStream("contents", contents)
        # Get the best-scoring fragment as the abstract, with highlight tags.
        abstract = highlighter.getBestFragment(stream, contents)
        # BUG FIX: getBestFragment returns None when nothing in `contents`
        # matches the query; the original then crashed on .replace().
        if abstract is None:
            abstract = ''
        results.append({
            'title': doc.get('title'),
            'url': doc.get('url'),
            'abstract': abstract.replace(' ', '')
        })
    return results
def search(self, query_str, restriction=2): self.attachCurrentThread() # 对query进行解析 result_contexts = [] # 根据有没有‘/’判断有没有词性, if '/' in query_str: # 有词性就转到search_phrases result_contexts = self.search_phrases(query_str) else: # 有词性就转到search_terms result_contexts = self.search_terms( QueryParser("context", self.analyzer).parse(query_str)) # 将搜索结果复原为文章返回 self.recover_to_article(query_str, result_contexts, restriction) final_result = [] #进行搜索结果中跟query相关的文段高量处理 simpleHTMLFormatter = SimpleHTMLFormatter(u"<b><font color='red'>", u"</font></b>") for index, recovered_query in enumerate(self.recovered_queries): # 不是直接拿用户输入的query来进行高亮处理,而是通过我们自己处理好的包含了位置约束的query进行高亮处理 recovered_query = recovered_query.replace("/", ",") highlighter = Highlighter( simpleHTMLFormatter, QueryScorer( QueryParser("context", self.analyzer).parse(recovered_query))) highLightText = highlighter.getBestFragment( self.analyzer, 'context', self.recovered_contexts[index]) if highLightText is not None: final_result.append(highLightText) return final_result
def search_synonym(self, query):
    """Search documents matching terms similar to `query` (word2vec).

    For each of the top-20 most-similar terms, collect up to 3 documents
    whose text highlights against the ORIGINAL query, and store them in
    `self.hits_dict` keyed by the (term, similarity) tuple.

    Returns `self.hits_dict`.
    """
    self.hits_dict = {}
    self.hits = []
    similar_terms = self.w2v_model.most_similar(query)
    parser = QueryParser('text', self.analyzer)
    parsed_query = parser.parse(query)

    # PERF: formatter/highlighter depend only on the original query, so
    # build them once instead of once per document (original rebuilt them
    # inside the innermost loop).
    simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
    highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(parsed_query))

    for s_term in similar_terms[:20]:
        # s_term is a (word, similarity) tuple; search on the word.
        s_term_query = parser.parse(s_term[0])
        hits = self.searcher.search(s_term_query, 1000).scoreDocs
        hit_count = 0
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            # Collapse whitespace-separated tokens back into one string
            # (original concatenated in a quadratic += loop).
            sentence = ''.join(doc.get('text').split())
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'text', sentence)
            if highLightText is not None:
                self.hits.append(highLightText)
                hit_count += 1
            if hit_count >= 3:
                break
        if len(self.hits) > 0:
            self.hits_dict[s_term] = self.hits
        self.hits = []
    return self.hits_dict
def run(command): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() STORE_DIR = "index1" directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(analysis(command)) HighlightFormatter = SimpleHTMLFormatter() highlighter = Highlighter(HighlightFormatter, QueryScorer(query)) scoreDocs = searcher.search(query, 500).scoreDocs print "%s total matching documents." % len(scoreDocs) result = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'path:', doc.get("path"), 'name:', doc.get( "name"), 'url:', doc.get("url"), 'title:', doc.get("title") text = doc.get('contents') highLightText = highlighter.getBestFragment(analyzer, "contents", text) if highLightText != None: highLightText = ''.join(highLightText.split(' ')) data = {} data['url'] = doc.get("url") data['title'] = doc.get('title') data['highlight'] = highLightText result.append(data) return result
def search_multi_terms(self, query):
    """Run a multi-term `query` and append highlighted texts to self.hits."""
    print('Multiterms search')
    hits = self.searcher.search(query, 100).scoreDocs
    # PERF: formatter/highlighter are loop-invariant — build them once
    # (original recreated both for every hit).
    simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
    highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
    for hit in hits:
        doc = self.searcher.doc(hit.doc)
        # Collapse whitespace-separated tokens back into one string
        # (original concatenated in a quadratic += loop).
        sentence = ''.join(doc.get("text").split())
        highLightText = highlighter.getBestFragment(
            self.analyzer, 'text', sentence)
        if highLightText is not None:
            self.hits.append(highLightText)
def search_phrase(self, term, phrase):
    """Search for `term` restricted to occurrences tagged with `phrase`.

    A document qualifies only if EVERY occurrence of `term` in its text
    carries the part-of-speech tag `phrase` (texts and tags are stored as
    parallel whitespace-separated fields "text" and "phrase").

    Returns up to 40 highlighted sentences (via self.recover_sentence).
    """
    print('Phrase search')
    self.hits = []
    index_list = []
    parser = QueryParser('text', self.analyzer)
    query = parser.parse(term)
    hits = self.searcher.search(query, 1000).scoreDocs
    if hits is None:
        return
    for hit in hits:
        doc = self.searcher.doc(hit.doc)
        text = doc.get("text")
        # Parallel token/POS-tag lists for this document.
        terms = text.split()
        phrases = doc.get("phrase").split()
        all_tags_match = True  # bool instead of the original 0/1 flag
        index = []  # positions of `term` within the document
        for i in range(len(terms)):
            if term == terms[i]:
                index.append(i)
                if not phrase == phrases[i]:
                    all_tags_match = False
                    break
        if all_tags_match:
            self.hits.append(text)
            index_list.append(index)
    # Rebuild sentences around the matched positions (mutates self.hits).
    self.recover_sentence(index_list)

    # PERF: highlighter is loop-invariant — build once, not per hit.
    simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
    highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
    hits_copy = self.hits
    self.hits = []
    # Wrap the matched terms in highlight tags.
    for hit in hits_copy:
        highLightText = highlighter.getBestFragment(
            self.analyzer, 'text', hit)
        if highLightText is not None:
            self.hits.append(highLightText)
    return self.hits[:40]
def search(self, term, window=2):
    """Search for `term`; return up to 40 highlighted context windows.

    Multi-word queries are delegated to search_multi_terms; single-term
    queries collect the positions of `term` in each matching document and
    recover a `window`-sized context around each (via recover_sentence).
    """
    self.hits = []
    index_list = []
    parser = QueryParser('text', self.analyzer)
    query = parser.parse(term)
    print(query)
    # Jump to multi-terms search if there are several words.
    # (Removed unused local `sort_para` from the original.)
    if self.multi_terms(query):
        self.search_multi_terms(query)
        return self.hits[:40]
    hits = self.searcher.search(query, 1000).scoreDocs
    for hit in hits:
        index = []
        doc = self.searcher.doc(hit.doc)
        text = doc.get("text")
        self.hits.append(text)
        # Save indexes of the target term in each document.
        for i, token in enumerate(text.split()):
            if term == token:
                index.append(i)
        index_list.append(index)
    # Rebuild context windows around the matches (mutates self.hits).
    self.recover_sentence(index_list, window)

    # PERF: highlighter is loop-invariant — build once, not per hit.
    simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
    highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
    hits_copy = self.hits
    self.hits = []
    for hit in hits_copy:
        highLightText = highlighter.getBestFragment(
            self.analyzer, 'text', hit)
        if highLightText is not None:
            self.hits.append(highLightText)
    print('search over')
    return self.hits[:40]
def run(searcher, analyzer, command): command_dict = parseCommand(command) seg_list = jieba.cut(command_dict['contents']) command_dict['contents'] = (" ".join(seg_list)) querys = BooleanQuery() for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) scorer = QueryScorer(query) fragmenter = SimpleSpanFragmenter(scorer, 250) simpleHTMLFormatter = SimpleHTMLFormatter("<b>", "</b>") highlighter = Highlighter(simpleHTMLFormatter, scorer) highlighter.setTextFragmenter(fragmenter) results = [] for i, scoreDoc in enumerate(scoreDocs): doc = searcher.doc(scoreDoc.doc) contents = doc.get("contents") if contents: tkStream = analyzer.tokenStream("contents", contents) highlight = highlighter.getBestFragment(tkStream, contents) highlightseg = highlight.split() highlight = ''.join(highlightseg) results.append( (doc.get("title").strip(), doc.get("url"), highlight)) ''' print 'path:', doc.get("path"), \ '\nname:', doc.get("name"), \ '\ntitle:', doc.get("title"), \ "url:",doc.get("url"), \ "\nsite:",doc.get("site"),\ "\ncontent:",highlight,"\n" ''' # print 'explain:', searcher.explain(query, scoreDoc.doc) return results
def work(searcher, analyzer, command, low=None, high=None): global prefixHTML global suffixHTML if command == '': return 0, [] tmp = jieba.cut(command) if command == '': return 0, [] tmp = jieba.cut(command) tmp = ''.join(command) command = tmp query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command) scoreDocs = searcher.search(query, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML) highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query)) result = [] match_count = len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) text = doc.get("contents") content = highlighter.getBestFragment(analyzer, "contents", text) if (low == None) and (high == None): result.append({ "url": doc.get('url'), "Content": content, "pic_url": doc.get('pic_url'), "title": doc.get('title'), "price": doc.get('price'), "description": doc.get('description') }) elif (low != None and high != None): if doc.get('price') >= int(low) and doc.get('price') <= int(high): result.append({ "url": doc.get('url'), "Content": content, "pic_url": doc.get('pic_url'), "title": doc.get('title'), "price": doc.get('price'), "description": doc.get('description') }) elif (low == None and high != None): if doc.get('price') <= int(high): result.append({ "url": doc.get('url'), "Content": content, "pic_url": doc.get('pic_url'), "title": doc.get('title'), "price": doc.get('price'), "description": doc.get('description') }) else: if doc.get('price') >= int(low): result.append({ "url": doc.get('url'), "Content": content, "pic_url": doc.get('pic_url'), "title": doc.get('title'), "price": doc.get('price'), "description": doc.get('description') }) result = sorted(result, key=lambda x: float(x["price"]), reverse=True) return match_count, result
def run(searcher, analyzer, command, urlclick):
    """Search recipes for `command` across several fields and return
    (results, swxc_res): highlighted result tuples plus findres output.

    `urlclick` maps url -> click count and is used to nudge near-tied
    results so more-clicked urls rank first.
    """
    if command == '':
        return []
    # Try the exact-match path first; fall back to a boosted OR query.
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))
    if len(res) > 0:
        scoreDocs = res
    else:
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            # De-emphasize the taste/tech fields.
            if k == 'taste' or k == 'tech':
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command, scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # NOTE(review): the ',' instead of a space below looks deliberate —
    # name/content fragments later have spaces stripped and commas turned
    # into spaces, which restores "<span style=...">. Confirm before
    # "fixing".
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>",
                                         "</span>")
    # One highlighter per field, each scored by that field's own query.
    scorer = QueryScorer(
        QueryParser(Version.LUCENE_CURRENT, 'name',
                    analyzer).parse(command))
    highlighter1 = Highlighter(formatter_name, scorer)
    highlighter2 = Highlighter(
        formatter_name,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(command)))
    highlighter3 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                        analyzer).parse(command)))
    highlighter4 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'taste',
                        analyzer).parse(command)))
    highlighter5 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'tech',
                        analyzer).parse(command)))
    highlighter6 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'others',
                        analyzer).parse(command)))
    fragmenter = SimpleFragmenter(1000)
    highlighter1.setTextFragmenter(fragmenter)
    highlighter2.setTextFragmenter(fragmenter)
    highlighter3.setTextFragmenter(fragmenter)
    highlighter4.setTextFragmenter(fragmenter)
    highlighter5.setTextFragmenter(fragmenter)
    highlighter6.setTextFragmenter(fragmenter)
    results = []
    for scoreDoc in scoreDocs:
        # Drop low-relevance docs: relative cutoff for large result sets,
        # absolute floor otherwise.
        if (len(scoreDocs) > 200 and
                len(scoreDocs) * scoreDoc.score < 2) or scoreDoc.score < 0.002:
            continue
        doc = searcher.doc(scoreDoc.doc)
        highlighterContent = highlighter1.getBestFragment(
            analyzer, 'name', doc.get('name'))
        highlighterContent2 = highlighter2.getBestFragment(
            analyzer, 'content', doc.get('content'))
        highlighterContent3 = highlighter3.getBestFragment(
            analyzer, 'ingredient', doc.get('ingredient'))
        highlighterContent4 = highlighter4.getBestFragment(
            analyzer, 'taste', doc.get('taste'))
        highlighterContent5 = highlighter5.getBestFragment(
            analyzer, 'tech', doc.get('tech'))
        highlighterContent6 = highlighter6.getBestFragment(
            analyzer, 'others', doc.get('others'))
        # Each field: if a highlighted fragment exists, post-process it
        # (strip spaces, commas -> spaces for the formatter_name hack);
        # otherwise fall back to the raw stored value.
        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(' ', '')
            highlighterContent2 = highlighterContent2.replace(',', ' ')
        else:
            highlighterContent2 = doc.get('content').replace(' ', '')
        if highlighterContent3:
            highlighterContent3 = highlighterContent3.replace(',', '')
        else:
            highlighterContent3 = (doc.get('ingredient')).replace(',', '')
        if highlighterContent4:
            pass
        else:
            highlighterContent4 = doc.get('taste')
        if highlighterContent5:
            pass
        else:
            highlighterContent5 = doc.get('tech')
        if highlighterContent6:
            highlighterContent6 = highlighterContent6.replace(',', '')
        else:
            highlighterContent6 = (doc.get('others')).replace(',', '')
        results.append(
            (highlighterContent, doc.get('img'), highlighterContent2,
             highlighterContent3, highlighterContent4, highlighterContent5,
             highlighterContent6, doc.get('url'), scoreDoc.score))
    # Bubble-sort-style pass over the top 20: when two adjacent results
    # score within 0.1 of each other, rank the more-clicked url first.
    # results[j][8] is the score, results[j][7] the url.
    for i in range(0, min(20, len(results)) - 1):
        flag = True
        for j in range(0, min(20, len(results)) - i - 1):
            if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                    results[j][7]] < urlclick[results[j + 1][7]]:
                flag = False
                results[j], results[j + 1] = results[j + 1], results[j]
        if flag:
            break
    return results, swxc_res