def highlighting(analyzer, contents, query):
    """Return up to three highlighted fragments of `contents` for `query`.

    Matching terms are wrapped in <b><font color='black'>...</font></b>
    tags; the text is broken into fragments of 30 characters joined
    by '...'.
    """
    scorer = QueryScorer(query)
    html_formatter = SimpleHTMLFormatter("<b><font color='black'>", "</font></b>")
    fragment_highlighter = Highlighter(html_formatter, scorer)
    fragment_highlighter.setTextFragmenter(SimpleFragmenter(30))
    stream = analyzer.tokenStream('contents', contents)
    return fragment_highlighter.getBestFragments(stream, contents, 3, '...')
def search(self, q, page=1, duplicates=False):
    """Run query string `q` and return (totalPages, docs) for `page`.

    Each entry in `docs` is a dict with 'title', 'url', 'duplicate' and a
    'highlight' snippet (up to three 40-char fragments joined by "...").
    Unless `duplicates` is set, the query is wrapped by
    addDuplicatesQuery() first. 10 results per page; at most 1000 hits
    are collected.
    """
    parsed = self.parser.parse(q)
    if not duplicates:
        parsed = self.addDuplicatesQuery(parsed)
    per_page = 10
    offset = (page - 1) * per_page
    collector = TopScoreDocCollector.create(1000, True)
    self.searcher.search(parsed, collector)
    highlighter = Highlighter(QueryScorer(parsed))
    highlighter.setTextFragmenter(SimpleFragmenter(40))
    found = []
    for hit in collector.topDocs(offset, per_page).scoreDocs:
        stored = self.searcher.doc(hit.doc)
        body = stored['contents']
        stream = self.analyzer.tokenStream("contents", StringReader(body))
        snippet = highlighter.getBestFragments(stream, body, 3, "...")
        found.append({
            'title': stored['title'],
            'url': stored['url'],
            'duplicate': stored['duplicate'],
            'highlight': snippet,
        })
    # NOTE(review): the searcher is discarded after a single call -- looks
    # single-use by design; confirm callers re-create it per query.
    del self.searcher
    total_pages = int(math.ceil(collector.getTotalHits() / float(per_page)))
    return total_pages, found
def text_search(command):
    """Search the text index for `command` and return matched documents.

    `command` is parsed into a field->value dict (via parseCommand with
    default field "contents"); every field becomes a MUST clause of a
    BooleanQuery. For each of the top 30 hits, a highlight snippet is
    built from the "contents" field (up to 3 fragments of 50 chars,
    whitespace stripped).

    :returns: list of [title, url, key_text] triples
    """
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []
    # BUG FIX: the highlight query was parsed with the loop-leftover
    # variable `k` as its field, so the highlighted field depended on
    # arbitrary dict iteration order. Highlight explicitly on "contents",
    # which is the field the fragments are extracted from below.
    query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(
        SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find texts which are around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join((myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3)))
        # raw string avoids the invalid-escape-sequence pitfall; same pattern
        key_text = re.sub(r'\s', '', key_text)
        temp = [doc.get("title"), doc.get('url'), key_text]
        res.append(temp)
    return res
def search(self, q, page=1, duplicates=False):
    """Paginated index search.

    Parses `q`, optionally adds the duplicates filter, collects up to
    1000 hits and returns a (page_count, results) pair where each result
    dict carries 'title', 'url', 'duplicate' and a 'highlight' made of
    up to three 40-character fragments separated by "...".
    """
    PER_PAGE = 10
    lucene_query = self.parser.parse(q)
    if not duplicates:
        lucene_query = self.addDuplicatesQuery(lucene_query)
    first = (page - 1) * PER_PAGE
    top_docs = TopScoreDocCollector.create(1000, True)
    self.searcher.search(lucene_query, top_docs)
    frag_highlighter = Highlighter(QueryScorer(lucene_query))
    frag_highlighter.setTextFragmenter(SimpleFragmenter(40))
    page_docs = []
    for sd in top_docs.topDocs(first, PER_PAGE).scoreDocs:
        record = self.searcher.doc(sd.doc)
        contents = record['contents']
        ts = self.analyzer.tokenStream("contents", StringReader(contents))
        page_docs.append({
            'title': record['title'],
            'url': record['url'],
            'duplicate': record['duplicate'],
            'highlight': frag_highlighter.getBestFragments(ts, contents, 3, "..."),
        })
    # Searcher is treated as single-use here (same as the sibling search()).
    del self.searcher
    page_count = int(math.ceil(top_docs.getTotalHits() / float(PER_PAGE)))
    return page_count, page_docs
def lucene_search(query, MAX, showHighlight): dir = os.getcwd() lucene.initVM() index_dir = SimpleFSDirectory(File(dir)) index_reader = DirectoryReader.open(index_dir) lucene_searcher = IndexSearcher(index_reader) lucene_analyzer = StandardAnalyzer(Version.LUCENE_48) my_query = QueryParser(Version.LUCENE_48, "text", lucene_analyzer).parse(query) #We can define the MAX number of results (default 10) total_hits = lucene_searcher.search(my_query, MAX) query_scorer = QueryScorer(my_query) formatter = SimpleHTMLFormatter() highlighter = Highlighter(formatter, query_scorer) # Set the fragment size. We break text in to fragment of 50 characters fragmenter = SimpleSpanFragmenter(query_scorer, 50) highlighter.setTextFragmenter(fragmenter) print "Only shows at most %s documents" % MAX if showHighlight: print "<br>" for hit in total_hits.scoreDocs: doc = lucene_searcher.doc(hit.doc) text = doc.get("text") ts = lucene_analyzer.tokenStream("text", StringReader(text)) if showHighlight: print "<p>" print doc.get("title") if showHighlight: print "<br>" print highlighter.getBestFragments(ts, text, 3, "...") print "</p>"
def testSimpleHighlighter(self): self.doSearching("Wicked") highlighter = Highlighter(QueryScorer(self.query)) highlighter.setTextFragmenter(SimpleFragmenter(40)) maxNumFragmentsRequired = 2 for scoreDoc in self.scoreDocs: text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME) tokenStream = self.analyzer.tokenStream(self.FIELD_NAME, StringReader(text)) result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...") print "\t", result
def doStandardHighlights(self): formatter = TestFormatter(self) highlighter = Highlighter(formatter, QueryScorer(self.query)) highlighter.setTextFragmenter(SimpleFragmenter(20)) for scoreDoc in self.scoreDocs: text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME) maxNumFragmentsRequired = 2 fragmentSeparator = "..." tokenStream = self.analyzer.tokenStream(self.FIELD_NAME, StringReader(text)) result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator) print "\t", result
def search_by(self, **kwargs):
    """Keyword-driven index search.

    Recognized kwargs: command (query string; empty -> None), field
    (index field to search), query_type ('chi' for Chinese via jieba
    segmentation, anything else stems with `stem`), num (max hits,
    default 50), attrs (stored attributes to copy into each result,
    default ['url', 'title']).

    For the 'token_content'/'token_title' fields an 'abstract' entry is
    added to each result: up to three 20-char fragments of the stored
    'content' field with matches wrapped in <strong> tags.

    Returns a list of per-document dicts, or None when command is empty.
    """
    command = kwargs.get('command', '')
    if command == '':
        return None
    field = kwargs.get('field')
    query_type = kwargs.get('query_type', 'chi')
    if query_type == 'chi':
        # Chinese: segment the query with jieba's search-mode tokenizer.
        if field in ['token_taglist', 'token_content', 'token_title', 'token_author']:
            command = ' '.join(jieba.cut_for_search(command))
        hlt_analyzer = self.analyzer['ChineseAnalyzer']
    else:
        # Non-Chinese: stem each query term before searching.
        if field in ['token_content', 'token_title']:
            command = ' '.join(map(stem, command.split()))
        hlt_analyzer = self.analyzer['StandardAnalyzer']
    # The query itself is always parsed with the simple analyzer;
    # hlt_analyzer is reserved for highlighting below.
    analyzer = self.analyzer['SimpleAnalyzer']
    num = kwargs.get('num', 50)
    attrs = kwargs.get('attrs', ['url', 'title'])
    print "[%s]\tSearching for '%s' in field '%s'" % (query_type, command, field)
    query = QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command)
    if field in ['token_content', 'token_title']:
        # Content-like fields get an abstract: build a separate highlight
        # query against the stored 'content' field.
        getAbs = True
        query_for_highlight = QueryParser(Version.LUCENE_CURRENT, 'content',
                                          hlt_analyzer).parse(command)
        scorer = QueryScorer(query_for_highlight)
        formatter = SimpleHTMLFormatter("<strong>", "</strong>")
        # formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
        highlighter = Highlighter(formatter, scorer)
        fragmenter = SimpleFragmenter(20)
        highlighter.setTextFragmenter(fragmenter)
    else:
        getAbs = False
    scoreDocs = self.searcher.search(query, num).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    articles = []
    for scoreDoc in scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        article = {}
        for attr in attrs:
            article[attr] = doc.get(attr)
        if getAbs is True:
            content = doc.get('content')
            tokenStream = hlt_analyzer.tokenStream("content", StringReader(content))
            article['abstract'] = highlighter.getBestFragments(tokenStream, content, 3, "...")
        articles.append(article)
    return articles
def get_highlighted_hits(self):
    """Return a list of (doc_id, fragments) pairs for every current hit.

    Fragments come from the 'contents' field (up to 10 per document,
    ~10-char spans via SimpleSpanFragmenter) and are also printed as
    they are collected.
    """
    results = []
    query_scorer = QueryScorer(self.query)
    span_highlighter = Highlighter(self.formatter, query_scorer)
    span_highlighter.setTextFragmenter(SimpleSpanFragmenter(query_scorer, 10))
    for score_doc in self.hits.scoreDocs:
        doc = self.searcher.doc(score_doc.doc)
        token_stream = TokenSources.getAnyTokenStream(
            self.index_reader, score_doc.doc, 'contents', self.analyzer)
        fragments = span_highlighter.getBestFragments(
            token_stream, doc.get('contents'), 10)
        for frag in fragments:
            print('fragment: ', frag)
        results.append((score_doc.doc, fragments))
    return results
def get_lm_doc_snippets(query, searcher, qparser, analyzer, preprocessor, topk=10):
    """
    Fetches the topk document snippets given query, searcher and qparser
    and returns (did, text) pair list.

    Each snippet is built from up to four 100-char highlight fragments of
    the stored "raw" field, then passed through get_parsed_text and
    preprocess_text before being joined into a single string.

    :param query: raw query string (parsed with qparser)
    :param searcher: Lucene IndexSearcher
    :param qparser: Lucene QueryParser
    :param analyzer: analyzer used to tokenize the "raw" field
    :param preprocessor: handed through to preprocess_text
    :param topk: number of documents to fetch
    :return: list of (doc_id, snippet_text) pairs
    """
    parsed = qparser.parse(query)
    snippet_highlighter = Highlighter(QueryScorer(parsed))
    snippet_highlighter.setTextFragmenter(SimpleFragmenter(100))
    snippets = []
    for sd in searcher.search(parsed, topk).scoreDocs:
        stored = searcher.doc(sd.doc)
        doc_id = stored.get("id")
        raw = stored.get("raw")
        stream = analyzer.tokenStream("raw", StringReader(raw))
        fragments = snippet_highlighter.getBestFragments(stream, raw, 4, "... ")
        cleaned = get_parsed_text(fragments)
        cleaned = " ".join(preprocess_text(preprocessor, [cleaned]))
        snippets.append((doc_id, cleaned))
    return snippets
def run(self, writer=None, analyzer=None): if writer is None: writer = self.writer if analyzer is None: analyzer = self.analyzer searcher = IndexSearcher(DirectoryReader.open(\ SimpleFSDirectory.open(File(self.store_dir)))) while True: print print "Hit enter with no input to quit." command = raw_input("Query:") if command == '': return print "Searching for:", command query = QueryParser(Version.LUCENE_43, "contents", analyzer).parse(command) # We'll just show the top 10 matching documents for now scoreDocs = searcher.search(query, 10).scoreDocs print "%s total matching documents." % len(scoreDocs) # Highlight the matching text in red highlighter = Highlighter(SimpleHTMLFormatter('<b><font color\ ="red">', '</font></b>'), QueryScorer(query)) # Using NullFragmenter since we still want to see # the whole document highlighter.setTextFragmenter(NullFragmenter()) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) tokenStream = analyzer.tokenStream("contents", StringReader(doc.get("contents"))) # arg 3: the maximum number of fragments # arg 4: the separator used to intersperse the # document fragments (typically "...") # arg 3 and 4 don't really matter with NullFragmenter result = highlighter.getBestFragments(tokenStream, doc.get("contents"), 2, "...") if len(result) > 10: file_handler = open(self.hits_dir + '/' + doc.get("name"), 'w+') file_handler.write(result) # create hit fragments, if we want to show them # arg 1: fragment size highlighter.setTextFragmenter(SimpleFragmenter(200)) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) tokenStream = analyzer.tokenStream("contents", StringReader(doc.get("contents"))) result = highlighter.getBestFragments(tokenStream, doc.get("contents"), 2, "...") if len(result) > 10: file_handler = open(self.frags_dir + '/' + doc.get("name"), 'w+') file_handler.write(result)
def _getHits(self):
    """Execute self.query with the configured sort order, build facet
    counts (author, pubtype, category), apply search-term highlighting to
    the stored fields, and return the list of Hit objects.

    Side effects: sets self.hits, self.searcher, self.facets and
    self.hitObjects; closes the index reader before returning.
    """
    reader = IndexReader.open(self.index)
    searcher = IndexSearcher(reader)
    # Sort by volume and entry number (typed values: treat as integer
    # where marked INT).
    sortDict = {
        "ta": (("volume", SortField.Type.INT), ("number", SortField.Type.INT)),
        "year": (("year-start", SortField.Type.INT), ("year-end", SortField.Type.INT)),
        "author-title": (("author-sort", SortField.Type.STRING), ("title-sort", SortField.Type.STRING)),
        "title": (("title-sort", Locale.GERMAN),),
        "author": (("author-sort", Locale.GERMAN),),
    }
    sortFields = []
    reverse = not self.ascending
    # Translate the requested orderby keys into Lucene SortFields.
    for name in self.orderby:
        for fieldName, typeNum in sortDict.get(name, []):
            sortFields.append(SortField(fieldName, typeNum, reverse))
    # Default sort: volume then entry number.
    if len(sortFields) == 0:
        sortFields = [SortField("volume", SortField.Type.INT),
                      SortField("number", SortField.Type.INT)]
    sort = Sort(sortFields)
    topDocs = searcher.search(self.query, None, 80000, sort)
    hits = topDocs.scoreDocs
    self.hits = hits
    self.searcher = searcher
    # Facet labels are language-dependent; anything but German falls back
    # to English.
    lang = translation.get_language()
    if lang != "de":
        lang = "en"
    facets = {"author": {}, "pubtype": {}, "category-%s" % lang: {}}
    # Highlighting
    highlighter = Highlighter(SimpleHTMLFormatter('<span class="highlight">', '</span>'),
                              QueryScorer(self.query))
    hitObjects = []
    # NOTE(review): `fields` is reused (not reset) across loop iterations;
    # every key is overwritten each pass, so this works, but it is fragile.
    fields = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        # print unicode(doc)
        fields["score"] = hit.score
        fields["volume"] = doc["volume"]
        fields["number"] = doc["number"]
        fields["id"] = doc["id"]
        fields["title"] = doc["title"]
        fields["author"] = doc["author"]
        # A document may carry several author fields; count each for facets.
        fields["authors"] = [field.stringValue() for field in doc.getFields("author")]
        for author in fields["authors"]:  # XXX
            facets["author"][author] = facets["author"].get(author, 0) + 1  # XXX
        fields["categories"] = [field.stringValue() for field in doc.getFields("category-%s" % lang)]
        for cat in fields["categories"]:
            facets["category-%s" % lang][cat] = facets["category-%s" % lang].get(cat, 0) + 1
        maxNumFragmentsRequired = 2
        fragmentSeparator = "...";
        pubtype = doc["pubtype"]
        fields["pubtype"] = pubtype
        facets["pubtype"][pubtype] = facets["pubtype"].get(pubtype, 0) + 1
        fields["city"] = doc["city"]
        fields["year"] = doc["year-start"]
        # Render a year range when start and end differ.
        if fields["year"] and doc["year-end"] and doc["year-end"] != fields["year"]:
            fields["year"] += " - " + doc["year-end"]
        highlightFields = ("title", "author", "city", "year", "category")
        # Free-text queries highlight every displayable field.
        if "freetext" in self.criteria:
            for fieldName in highlightFields:
                try:
                    tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                    newVal = highlighter.getBestFragments(tokenStream, fields[fieldName],
                                                          maxNumFragmentsRequired, fragmentSeparator)
                    if len(newVal) > 0:
                        # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                        fields[fieldName] = newVal
                except:
                    # Highlighting is best-effort; keep the plain value on error.
                    continue
        # Field-specific criteria (including -de/-en variants) highlight
        # only the matching field.
        for fieldName in highlightFields:
            if fieldName in self.criteria or fieldName + "-de" in self.criteria or fieldName + "-en" in self.criteria:
                try:
                    tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                    newVal = highlighter.getBestFragments(tokenStream, fields[fieldName],
                                                          maxNumFragmentsRequired, fragmentSeparator)
                    if len(newVal) > 0:
                        # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                        fields[fieldName] = newVal
                except:
                    continue
        """if "author" in self.criteria: try: tokenStream = self.analyzer.tokenStream("author", lucene.StringReader(fields["author"])) fields["author"] = highlighter.getBestFragments(tokenStream, fields["author"], maxNumFragmentsRequired, fragmentSeparator) except: pass"""
        hitObjects.append(
            Hit(fields["id"], fields["volume"], fields["number"], fields["title"],
                fields["author"], fields["city"], fields["year"], fields["categories"],
                fields["pubtype"], fields["score"]))
    # Drop facets with fewer than two values; keep the top 25 entries of
    # the rest, sorted by count (desc) then label.
    facetsToDelete = []
    for facet in facets:
        if len(facets[facet]) < 2:
            facetsToDelete.append(facet)
            continue
        values = sorted(facets[facet].items(), key=itemgetter(0))
        values = sorted(values, key=itemgetter(1), reverse=True)
        facets[facet] = values[:25]
    for facet in facetsToDelete:
        del facets[facet]
    self.facets = facets
    reader.close()
    self.hitObjects = hitObjects
    return hitObjects
def run(self, writer=None, analyzer=None):
    """Interactive query loop over the index at self.store_dir.

    Prompts for query strings until an empty line is entered. For each
    query's top-10 hits on the "contents" field it writes two sets of
    red-highlighted output: whole documents (NullFragmenter) into
    self.hits_dir and 200-char fragments into self.frags_dir, one file
    per document (named after the doc's "name" field), keeping only
    results longer than 10 characters.

    :param writer: index writer; defaults to self.writer
    :param analyzer: analyzer for parsing/token streams; defaults to self.analyzer
    """
    if writer is None:
        writer = self.writer
    if analyzer is None:
        analyzer = self.analyzer
    searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory.open(File(self.store_dir))))

    def _write_highlights(highlighter, scoreDocs, out_dir):
        # Render each hit's "contents" through `highlighter` and persist
        # non-trivial results (> 10 chars) to out_dir/<doc name>.
        # args 3/4 of getBestFragments (max fragments, separator) are
        # irrelevant when a NullFragmenter is installed.
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            result = highlighter.getBestFragments(
                tokenStream, doc.get("contents"), 2, "...")
            if len(result) > 10:
                # FIX: output files were opened but never closed (handle
                # leak); the with-statement guarantees a deterministic close.
                with open(out_dir + '/' + doc.get("name"), 'w+') as fh:
                    fh.write(result)

    while True:
        print()
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return
        print("Searching for:", command)
        query = QueryParser(Version.LUCENE_43, "contents",
                            analyzer).parse(command)
        # We'll just show the top 10 matching documents for now
        scoreDocs = searcher.search(query, 10).scoreDocs
        print("%s total matching documents." % len(scoreDocs))
        # Highlight the matching text in red; NullFragmenter keeps the
        # whole document instead of cutting it into fragments.
        highlighter = Highlighter(
            SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
            QueryScorer(query))
        highlighter.setTextFragmenter(NullFragmenter())
        _write_highlights(highlighter, scoreDocs, self.hits_dir)
        # Second pass: 200-char hit fragments for the snippet view.
        highlighter.setTextFragmenter(SimpleFragmenter(200))
        _write_highlights(highlighter, scoreDocs, self.frags_dir)