def text_search(command):
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []
    # Highlight against the "contents" field explicitly, rather than
    # whatever key the loop above happened to end on.
    query_highlight = QueryParser(
        Version.LUCENE_CURRENT, "contents",
        envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(SimpleHTMLFormatter(),
                                QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # Find text fragments around the keyword.
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join(myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3))
        key_text = re.sub(r'\s', '', key_text)
        res.append([doc.get("title"), doc.get("url"), key_text])
    return res
def search_synonym(self, query):
    self.hits_dict = {}
    self.hits = []
    similar_terms = self.w2v_model.most_similar(query)
    parser = QueryParser('text', self.analyzer)
    query = parser.parse(query)
    for s_term in similar_terms[:20]:
        s_term_query = parser.parse(s_term[0])
        hits = self.searcher.search(s_term_query, 1000).scoreDocs
        hit_count = 0
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            text = doc.get('text')
            # Re-join the stored tokens without separators (Chinese text).
            sentence = ''.join(text.split())
            simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
            highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
            highLightText = highlighter.getBestFragment(self.analyzer, 'text',
                                                        sentence)
            if highLightText is not None:
                self.hits.append(highLightText)
                hit_count += 1
            if hit_count >= 3:
                break
        if len(self.hits) > 0:
            # Key on the synonym string; s_term itself is a
            # (word, similarity) tuple.
            self.hits_dict[s_term[0]] = self.hits
        self.hits = []
    return self.hits_dict
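# Note on search_synonym(): gensim's most_similar() returns a list of
# (word, cosine similarity) tuples, which is why s_term[0] is parsed and
# used as the dict key. A minimal sketch of that shape; the model path,
# helper name, and sample term are hypothetical, not from the original code.
def demo_most_similar(model_path, term):
    from gensim.models import Word2Vec
    model = Word2Vec.load(model_path)
    for word, score in model.most_similar(term)[:5]:
        print('%s\t%.3f' % (word, score))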
def search(self, query_str, restriction=2):
    self.attachCurrentThread()
    # Parse the query.
    result_contexts = []
    # Use the presence of '/' to decide whether the query carries POS tags.
    if '/' in query_str:
        # With POS tags, delegate to search_phrases.
        result_contexts = self.search_phrases(query_str)
    else:
        # Without POS tags, delegate to search_terms.
        result_contexts = self.search_terms(
            QueryParser("context", self.analyzer).parse(query_str))
    # Recover the matched contexts into full articles.
    self.recover_to_article(query_str, result_contexts, restriction)
    final_result = []
    # Highlight the query-related passages in the results.
    simpleHTMLFormatter = SimpleHTMLFormatter(u"<b><font color='red'>",
                                              u"</font></b>")
    for index, recovered_query in enumerate(self.recovered_queries):
        # Highlight with our own rewritten query, which encodes the
        # positional constraints, rather than the raw user query.
        recovered_query = recovered_query.replace("/", ",")
        highlighter = Highlighter(
            simpleHTMLFormatter,
            QueryScorer(
                QueryParser("context",
                            self.analyzer).parse(recovered_query)))
        highLightText = highlighter.getBestFragment(
            self.analyzer, 'context', self.recovered_contexts[index])
        if highLightText is not None:
            final_result.append(highLightText)
    return final_result
def run(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(analysis(command))
    HighlightFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(HighlightFormatter, QueryScorer(query))
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get("name"), \
            'url:', doc.get("url"), 'title:', doc.get("title")
        text = doc.get('contents')
        highLightText = highlighter.getBestFragment(analyzer, "contents",
                                                    text)
        if highLightText is not None:
            highLightText = ''.join(highLightText.split(' '))
        data = {}
        data['url'] = doc.get("url")
        data['title'] = doc.get('title')
        data['highlight'] = highLightText
        result.append(data)
    return result
def highlighting(analyzer, contents, query):
    formatter = SimpleHTMLFormatter("<b><font color='black'>", "</font></b>")
    highlighter = Highlighter(formatter, QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(30))
    tokenStream = analyzer.tokenStream('contents', contents)
    light_content = highlighter.getBestFragments(tokenStream, contents, 3,
                                                 '...')
    return light_content
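# A minimal end-to-end sketch of driving highlighting() above. The
# StandardAnalyzer choice, the field name 'contents', and the sample text
# are illustrative assumptions; any analyzer/query pair works.
def demo_highlighting():
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'contents',
                        analyzer).parse('lucene')
    text = 'Apache Lucene is a full-text search library written in Java.'
    print(highlighting(analyzer, text, query))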
def search(self, q, page=1, duplicates=False):
    query = self.parser.parse(q)
    if not duplicates:
        query = self.addDuplicatesQuery(query)
    perPage = 10
    start = (page - 1) * perPage
    results = TopScoreDocCollector.create(1000, True)
    self.searcher.search(query, results)
    highlighter = Highlighter(QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(40))
    docs = []
    for scoreDoc in results.topDocs(start, perPage).scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        tokenStream = self.analyzer.tokenStream(
            "contents", StringReader(doc['contents']))
        highlight = highlighter.getBestFragments(tokenStream,
                                                 doc['contents'], 3, "...")
        docs.append({
            'title': doc['title'],
            'url': doc['url'],
            'duplicate': doc['duplicate'],
            'highlight': highlight
        })
    del self.searcher
    totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))
    return totalPages, docs
def output(self, score_docs, command):
    '''
    Highlight and return the search results.

    Input:
        `score_docs`: search results from the index
    Output:
        list of document info found in the index; each entry includes
        `title`, `url` and `abstract`
    '''
    query = QueryParser('contents', self.analyzer).parse(command)
    highlighter = Highlighter(self.formatter, QueryScorer(query))
    # Limit the max number of characters per fragment.
    highlighter.setTextFragmenter(SimpleFragmenter(200))
    results = []
    for score_doc in score_docs:
        doc = self.searcher.doc(score_doc.doc)
        contents = doc.get('contents')
        stream = self.analyzer.tokenStream("contents", contents)
        # Get the highlighted abstract; it is None when nothing matched.
        abstract = highlighter.getBestFragment(stream, contents)
        results.append({
            'title': doc.get('title'),
            'url': doc.get('url'),
            'abstract': abstract.replace(' ', '') if abstract else ''
        })
    return results
def testSimpleHighlighter(self):
    self.doSearching("Wicked")
    highlighter = Highlighter(QueryScorer(self.query))
    highlighter.setTextFragmenter(SimpleFragmenter(40))
    maxNumFragmentsRequired = 2
    for scoreDoc in self.scoreDocs:
        text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
        tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                StringReader(text))
        result = highlighter.getBestFragments(tokenStream, text,
                                              maxNumFragmentsRequired, "...")
        print "\t", result
def search_multi_terms(self, query):
    print('Multi-term search')
    hits = self.searcher.search(query, 100).scoreDocs
    for hit in hits:
        doc = self.searcher.doc(hit.doc)
        text = doc.get("text")
        # Re-join the stored tokens without separators (Chinese text).
        sentence = ''.join(text.split())
        simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
        highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
        highLightText = highlighter.getBestFragment(self.analyzer, 'text',
                                                    sentence)
        if highLightText is not None:
            self.hits.append(highLightText)
def doStandardHighlights(self):
    formatter = TestFormatter(self)
    highlighter = Highlighter(formatter, QueryScorer(self.query))
    highlighter.setTextFragmenter(SimpleFragmenter(20))
    maxNumFragmentsRequired = 2
    fragmentSeparator = "..."
    for scoreDoc in self.scoreDocs:
        text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
        tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                StringReader(text))
        result = highlighter.getBestFragments(tokenStream, text,
                                              maxNumFragmentsRequired,
                                              fragmentSeparator)
        print "\t", result
def search_by(self, **kwargs):
    command = kwargs.get('command', '')
    if command == '':
        return None
    field = kwargs.get('field')
    query_type = kwargs.get('query_type', 'chi')
    if query_type == 'chi':
        if field in ['token_taglist', 'token_content', 'token_title',
                     'token_author']:
            command = ' '.join(jieba.cut_for_search(command))
        hlt_analyzer = self.analyzer['ChineseAnalyzer']
    else:
        if field in ['token_content', 'token_title']:
            command = ' '.join(map(stem, command.split()))
        hlt_analyzer = self.analyzer['StandardAnalyzer']
    analyzer = self.analyzer['SimpleAnalyzer']
    num = kwargs.get('num', 50)
    attrs = kwargs.get('attrs', ['url', 'title'])
    print "[%s]\tSearching for '%s' in field '%s'" % (query_type, command,
                                                      field)
    query = QueryParser(Version.LUCENE_CURRENT, field,
                        analyzer).parse(command)
    if field in ['token_content', 'token_title']:
        getAbs = True
        query_for_highlight = QueryParser(Version.LUCENE_CURRENT, 'content',
                                          hlt_analyzer).parse(command)
        scorer = QueryScorer(query_for_highlight)
        formatter = SimpleHTMLFormatter("<strong>", "</strong>")
        # formatter = SimpleHTMLFormatter("<span class=\"highlight\">",
        #                                 "</span>")
        highlighter = Highlighter(formatter, scorer)
        fragmenter = SimpleFragmenter(20)
        highlighter.setTextFragmenter(fragmenter)
    else:
        getAbs = False
    scoreDocs = self.searcher.search(query, num).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    articles = []
    for scoreDoc in scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        article = {}
        for attr in attrs:
            article[attr] = doc.get(attr)
        if getAbs is True:
            content = doc.get('content')
            tokenStream = hlt_analyzer.tokenStream("content",
                                                   StringReader(content))
            article['abstract'] = highlighter.getBestFragments(
                tokenStream, content, 3, "...")
        articles.append(article)
    return articles
def search_phrase(self, term, phrase):
    print('Phrase search')
    self.hits = []
    index_list = []
    parser = QueryParser('text', self.analyzer)
    query = parser.parse(term)
    hits = self.searcher.search(query, 1000).scoreDocs
    if hits is None:
        return
    for hit in hits:
        doc = self.searcher.doc(hit.doc)
        text = doc.get("text")
        phrases = doc.get("phrase")
        # Walk the stored text and its POS-tag sequence in parallel.
        terms = text.split()
        phrases = phrases.split()
        # flag stays 1 only if every occurrence of the term carries the
        # requested POS tag.
        flag = 1
        # Positions of the searched term; there may be several.
        index = []
        for i in range(len(terms)):
            if term == terms[i]:
                index.append(i)
                if not phrase == phrases[i]:
                    flag = 0
                    break
        if flag == 1:
            self.hits.append(text)
            index_list.append(index)
    self.recover_sentence(index_list)
    hits_copy = self.hits
    self.hits = []
    # Wrap matched terms in highlight tags.
    for hit in hits_copy:
        simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
        highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
        highLightText = highlighter.getBestFragment(self.analyzer, 'text',
                                                    hit)
        if highLightText is not None:
            self.hits.append(highLightText)
    return self.hits[:40]
def search(self, terms, n_hits=5):
    """ Run search query. """
    # TODO: support date range queries
    # Build the query.
    parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
    # parser.setDefaultOperator(QueryParser.Operator.AND)
    # Defaults to OR unless terms have a modifier.
    query = MultiFieldQueryParser.parse(
        parser, terms)  # https://stackoverflow.com/a/26853987/130164
    # Create a highlighter.
    highlighter = Highlighter(SimpleHTMLFormatter('*', '*'),
                              QueryScorer(query))
    # Execute search for top N hits.
    return [
        self._process_search_result(result, highlighter)
        for result in self.searcher.search(query, n_hits).scoreDocs
    ]
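# Why the static parse() call above: in PyLucene the inherited instance
# method parse() is shadowed by MultiFieldQueryParser's static overloads,
# so the parser object is passed explicitly (see the Stack Overflow link).
# A minimal sketch, with the field names assumed to exist in the index:
#
#   parser = MultiFieldQueryParser(['fullpath', 'body'], analyzer)
#   query = MultiFieldQueryParser.parse(parser, 'error handling')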
def get_highlighted_hits(self):
    extracted_fragments = []
    scorer = QueryScorer(self.query)
    fragmenter = SimpleSpanFragmenter(scorer, 10)
    highlighter = Highlighter(self.formatter, scorer)
    highlighter.setTextFragmenter(fragmenter)
    for hit in self.hits.scoreDocs:
        document = self.searcher.doc(hit.doc)
        stream = TokenSources.getAnyTokenStream(self.index_reader, hit.doc,
                                                'contents', self.analyzer)
        best_fragments = highlighter.getBestFragments(
            stream, document.get('contents'), 10)
        for fragment in best_fragments:
            print('fragment: ', fragment)
        extracted_fragments.append((hit.doc, best_fragments))
    return extracted_fragments
def search(self, term, window=2):
    self.hits = []
    index_list = []
    sort_para = term
    parser = QueryParser('text', self.analyzer)
    query = parser.parse(term)
    print(query)
    # Jump to multi-term search if the query contains several words.
    if self.multi_terms(query):
        self.search_multi_terms(query)
        return self.hits[:40]
    hits = self.searcher.search(query, 1000).scoreDocs
    for hit in hits:
        index = []
        doc = self.searcher.doc(hit.doc)
        text = doc.get("text")
        self.hits.append(text)
        # Save the positions of the target term in each document.
        terms = text.split()
        for i in range(len(terms)):
            if term == terms[i]:
                index.append(i)
        index_list.append(index)
    self.recover_sentence(index_list, window)
    hits_copy = self.hits
    self.hits = []
    for hit in hits_copy:
        simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
        highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
        highLightText = highlighter.getBestFragment(self.analyzer, 'text',
                                                    hit)
        if highLightText is not None:
            self.hits.append(highLightText)
    print('search over')
    return self.hits[:40]
def run(searcher, analyzer, command):
    command_dict = parseCommand(command)
    seg_list = jieba.cut(command_dict['contents'])
    command_dict['contents'] = " ".join(seg_list)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    # Note: this scores highlights against the last sub-query from the loop
    # above, not against the whole BooleanQuery.
    scorer = QueryScorer(query)
    fragmenter = SimpleSpanFragmenter(scorer, 250)
    simpleHTMLFormatter = SimpleHTMLFormatter("<b>", "</b>")
    highlighter = Highlighter(simpleHTMLFormatter, scorer)
    highlighter.setTextFragmenter(fragmenter)
    results = []
    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        contents = doc.get("contents")
        if contents:
            tkStream = analyzer.tokenStream("contents", contents)
            highlight = highlighter.getBestFragment(tkStream, contents)
            # getBestFragment returns None when nothing matched.
            if highlight:
                highlight = ''.join(highlight.split())
                results.append(
                    (doc.get("title").strip(), doc.get("url"), highlight))
        '''
        print 'path:', doc.get("path"), \
            '\nname:', doc.get("name"), \
            '\ntitle:', doc.get("title"), \
            "url:", doc.get("url"), \
            "\nsite:", doc.get("site"), \
            "\ncontent:", highlight, "\n"
        '''
        # print 'explain:', searcher.explain(query, scoreDoc.doc)
    return results
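# jieba.cut() returns a generator of tokens; joining them with spaces, as
# run() does above, gives the QueryParser a whitespace-delimited query.
# A quick sketch, using the sample phrase from jieba's own README:
def demo_segment(phrase=u'我来到北京清华大学'):
    import jieba
    return ' '.join(jieba.cut(phrase))  # e.g. u'我 来到 北京 清华大学'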
def get_lm_doc_snippets(query, searcher, qparser, analyzer, preprocessor,
                        topk=10):
    """
    Fetches the topk document snippets for a query and returns a list of
    (did, text) pairs.
    :param query: query string
    :param searcher: Lucene IndexSearcher
    :param qparser: Lucene QueryParser
    :param analyzer: Lucene Analyzer used to tokenize the raw text
    :param preprocessor: text preprocessor applied to the snippets
    :param topk: number of documents to fetch
    :return: list of (did, text) pairs
    """
    dids_text = []
    query = qparser.parse(query)
    scoreDocs = searcher.search(query, topk).scoreDocs
    highlighter = Highlighter(QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(100))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        did = doc.get("id")
        text = doc.get("raw")
        token_stream = analyzer.tokenStream("raw", StringReader(text))
        result = highlighter.getBestFragments(token_stream, text, 4, "... ")
        text = get_parsed_text(result)
        text = preprocess_text(preprocessor, [text])
        text = " ".join(text)
        dids_text.append((did, text))
    return dids_text
def work(searcher, analyzer, command, low=None, high=None):
    global prefixHTML
    global suffixHTML
    if command == '':
        return 0, []
    # Segment the query and join the tokens with spaces. The original
    # joined the raw command string (a no-op); joining the segmented
    # tokens is presumably what was intended.
    command = ' '.join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
    highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
    result = []
    match_count = len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        content = highlighter.getBestFragment(analyzer, "contents", text)
        # Keep the document only if its price falls inside [low, high];
        # prices are compared numerically (the original compared the stored
        # string against an int).
        price = float(doc.get('price'))
        if low is not None and price < float(low):
            continue
        if high is not None and price > float(high):
            continue
        result.append({
            "url": doc.get('url'),
            "Content": content,
            "pic_url": doc.get('pic_url'),
            "title": doc.get('title'),
            "price": doc.get('price'),
            "description": doc.get('description')
        })
    result = sorted(result, key=lambda x: float(x["price"]), reverse=True)
    return match_count, result
def run(searcher, analyzer, command, urlclick):
    if command == '':
        return []
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))
    if len(res) > 0:
        scoreDocs = res
    else:
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            if k == 'taste' or k == 'tech':
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command, scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # The name formatter uses a comma instead of a space inside the tag:
    # spaces are stripped from highlighted names below, and the comma is
    # turned back into a space afterwards.
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>",
                                         "</span>")
    scorer = QueryScorer(
        QueryParser(Version.LUCENE_CURRENT, 'name', analyzer).parse(command))
    highlighter1 = Highlighter(formatter_name, scorer)
    highlighter2 = Highlighter(
        formatter_name,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(command)))
    highlighter3 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                        analyzer).parse(command)))
    highlighter4 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'taste',
                        analyzer).parse(command)))
    highlighter5 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'tech',
                        analyzer).parse(command)))
    highlighter6 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'others',
                        analyzer).parse(command)))
    fragmenter = SimpleFragmenter(1000)
    for h in (highlighter1, highlighter2, highlighter3, highlighter4,
              highlighter5, highlighter6):
        h.setTextFragmenter(fragmenter)
    results = []
    for scoreDoc in scoreDocs:
        # Drop low-scoring documents from large result sets.
        if (len(scoreDocs) > 200 and len(scoreDocs) * scoreDoc.score < 2) \
                or scoreDoc.score < 0.002:
            continue
        doc = searcher.doc(scoreDoc.doc)
        highlighterContent = highlighter1.getBestFragment(
            analyzer, 'name', doc.get('name'))
        highlighterContent2 = highlighter2.getBestFragment(
            analyzer, 'content', doc.get('content'))
        highlighterContent3 = highlighter3.getBestFragment(
            analyzer, 'ingredient', doc.get('ingredient'))
        highlighterContent4 = highlighter4.getBestFragment(
            analyzer, 'taste', doc.get('taste'))
        highlighterContent5 = highlighter5.getBestFragment(
            analyzer, 'tech', doc.get('tech'))
        highlighterContent6 = highlighter6.getBestFragment(
            analyzer, 'others', doc.get('others'))
        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(' ', '')
            highlighterContent2 = highlighterContent2.replace(',', ' ')
        else:
            highlighterContent2 = doc.get('content').replace(' ', '')
        if highlighterContent3:
            highlighterContent3 = highlighterContent3.replace(',', '')
        else:
            highlighterContent3 = doc.get('ingredient').replace(',', '')
        if not highlighterContent4:
            highlighterContent4 = doc.get('taste')
        if not highlighterContent5:
            highlighterContent5 = doc.get('tech')
        if highlighterContent6:
            highlighterContent6 = highlighterContent6.replace(',', '')
        else:
            highlighterContent6 = doc.get('others').replace(',', '')
        results.append(
            (highlighterContent, doc.get('img'), highlighterContent2,
             highlighterContent3, highlighterContent4, highlighterContent5,
             highlighterContent6, doc.get('url'), scoreDoc.score))
    # Bubble results with near-equal scores so that more-clicked URLs rank
    # first among the top 20.
    for i in range(0, min(20, len(results)) - 1):
        flag = True
        for j in range(0, min(20, len(results)) - i - 1):
            if (abs(results[j][8] - results[j + 1][8]) < 0.1 and
                    urlclick[results[j][7]] < urlclick[results[j + 1][7]]):
                flag = False
                results[j], results[j + 1] = results[j + 1], results[j]
        if flag:
            break
    return results, swxc_res
def build_highlighter(parsed_query):
    scorer = QueryScorer(parsed_query, 'content')
    highlighter = Highlighter(SimpleHTMLFormatter(), scorer)
    fragmenter = SimpleSpanFragmenter(scorer, FRAGMENT_SIZE)
    highlighter.setTextFragmenter(fragmenter)
    return highlighter
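# A minimal usage sketch for build_highlighter(). FRAGMENT_SIZE is assumed
# to be a module-level constant, and the token stream must come from the
# same 'content' field the QueryScorer was bound to.
def demo_best_fragments(searcher, analyzer, parsed_query, doc_id):
    highlighter = build_highlighter(parsed_query)
    text = searcher.doc(doc_id).get('content')
    stream = analyzer.tokenStream('content', StringReader(text))
    return highlighter.getBestFragments(stream, text, 3, '...')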
def superSearch(command, command_dict, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run super search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))
    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    # Required fields.
    for k, v in (command_dict[0]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    # Excluded fields.
    for k, v in (command_dict[1]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command + ' ' + command_dict[0].get("ingredient", ''),
                       scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # As in run(), the comma in the name formatter is a placeholder for the
    # space that gets stripped from highlighted names below.
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>",
                                         "</span>")
    # One highlighter per searched field; '' marks a field with no query.
    if command:
        scorer = QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'name',
                        analyzer).parse(command))
        highlighters = [Highlighter(formatter_name, scorer)]
    else:
        highlighters = ['']
    if command_dict[0].get('ingredient'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                                analyzer).parse(
                                    command_dict[0]['ingredient']))))
    else:
        highlighters.append('')
    if command_dict[0].get('taste'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'taste',
                                analyzer).parse(command_dict[0]['taste']))))
    else:
        highlighters.append('')
    if command_dict[0].get('tech'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'tech',
                                analyzer).parse(command_dict[0]['tech']))))
    else:
        highlighters.append('')
    fragmenter = SimpleFragmenter(1000)
    for h in highlighters:
        if h:
            h.setTextFragmenter(fragmenter)
    results = []
    for scoreDoc in scoreDocs:
        if (scoreDoc.score * len(scoreDocs) < 200 and
                len(scoreDocs) > 200) or scoreDoc.score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)
        if command:
            highlighterContent = highlighters[0].getBestFragment(
                analyzer, 'name', doc.get('name'))
        else:
            highlighterContent = ''
        if highlighters[1]:
            highlighterContent2 = highlighters[1].getBestFragment(
                analyzer, 'ingredient', doc.get('ingredient'))
        else:
            highlighterContent2 = ''
        if highlighters[2]:
            highlighterContent3 = highlighters[2].getBestFragment(
                analyzer, 'taste', doc.get('taste'))
        else:
            highlighterContent3 = ''
        if highlighters[3]:
            highlighterContent4 = highlighters[3].getBestFragment(
                analyzer, 'tech', doc.get('tech'))
        else:
            highlighterContent4 = ''
        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(',', '')
        else:
            highlighterContent2 = doc.get('ingredient').replace(',', '')
        if not highlighterContent3:
            highlighterContent3 = doc.get('taste')
        if not highlighterContent4:
            highlighterContent4 = doc.get('tech')
        results.append(
            (highlighterContent, doc.get('img'),
             doc.get('content').replace(' ', ''), highlighterContent2,
             highlighterContent3, highlighterContent4,
             doc.get('others').replace(',', ''), doc.get('url'),
             scoreDoc.score))
    # Same click-count reordering as in run().
    for i in range(0, min(20, len(results)) - 1):
        flag = True
        for j in range(0, min(20, len(results)) - i - 1):
            if (abs(results[j][8] - results[j + 1][8]) < 0.1 and
                    urlclick[results[j][7]] < urlclick[results[j + 1][7]]):
                flag = False
                results[j], results[j + 1] = results[j + 1], results[j]
        if flag:
            break
    return results, swxc_res
def run(self, writer=None, analyzer=None):
    if writer is None:
        writer = self.writer
    if analyzer is None:
        analyzer = self.analyzer
    searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory.open(File(self.store_dir))))
    while True:
        print()
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return
        print("Searching for:", command)
        query = QueryParser(Version.LUCENE_43, "contents",
                            analyzer).parse(command)
        # We'll just show the top 10 matching documents for now.
        scoreDocs = searcher.search(query, 10).scoreDocs
        print("%s total matching documents." % len(scoreDocs))
        # Highlight the matching text in red.
        highlighter = Highlighter(
            SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
            QueryScorer(query))
        # Use NullFragmenter since we still want to see the whole document.
        highlighter.setTextFragmenter(NullFragmenter())
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            # arg 3: the maximum number of fragments
            # arg 4: the separator interspersed between fragments
            #        (typically "...")
            # Neither really matters with NullFragmenter.
            result = highlighter.getBestFragments(tokenStream,
                                                  doc.get("contents"), 2,
                                                  "...")
            if len(result) > 10:
                with open(self.hits_dir + '/' + doc.get("name"),
                          'w+') as file_handler:
                    file_handler.write(result)
        # Create hit fragments, if we want to show them.
        # arg 1: fragment size
        highlighter.setTextFragmenter(SimpleFragmenter(200))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            result = highlighter.getBestFragments(tokenStream,
                                                  doc.get("contents"), 2,
                                                  "...")
            if len(result) > 10:
                with open(self.frags_dir + '/' + doc.get("name"),
                          'w+') as file_handler:
                    file_handler.write(result)
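# Common to all the snippets above: the JVM must be started once per
# process and attached in every thread that calls into Lucene. A minimal
# sketch of that setup (the vmargs are optional):
#
#   import lucene
#   lucene.initVM(vmargs=['-Djava.awt.headless=true'])
#   # later, inside worker threads:
#   lucene.getVMEnv().attachCurrentThread()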