Exemplo n.º 1
0
def highlighting(analyzer, contents, query):
    """Return the best highlighted fragments of ``contents`` for ``query``.

    Matching terms are wrapped in ``<b><font color='black'>...</font></b>``.
    The highlighter uses ``SimpleFragmenter(30)`` and the best three
    fragments are returned joined by ``'...'``.

    :param analyzer: analyzer used to tokenize ``contents``
    :param contents: text to highlight (tokenized as field ``'contents'``)
    :param query: query whose terms are scored and highlighted
    :return: the value returned by ``Highlighter.getBestFragments``
    """
    markup = SimpleHTMLFormatter("<b><font color='black'>", "</font></b>")
    scorer = QueryScorer(query)

    fragment_picker = Highlighter(markup, scorer)
    fragment_picker.setTextFragmenter(SimpleFragmenter(30))

    stream = analyzer.tokenStream('contents', contents)
    return fragment_picker.getBestFragments(stream, contents, 3, '...')
Exemplo n.º 2
0
    def search(self, q, page = 1, duplicates = False):
        """Run the query string ``q`` and return one page of highlighted hits.

        :param q: query string handed to ``self.parser``
        :param page: 1-based page number; each page holds 10 hits
        :param duplicates: when false, the parsed query is first passed
            through ``self.addDuplicatesQuery`` (presumably to filter out
            duplicate documents -- confirm against that method)
        :return: ``(totalPages, docs)`` where ``docs`` is a list of dicts with
            the keys ``title``, ``url``, ``duplicate`` and ``highlight``

        NOTE: ``self.searcher`` is deleted before returning, so this instance
        needs a fresh searcher before ``search`` can be called again.
        """
        parsed = self.parser.parse(q)
        if not duplicates:
            parsed = self.addDuplicatesQuery(parsed)

        page_size = 10
        offset = (page - 1) * page_size

        # The collector is created with a limit of 1000 hits.
        collector = TopScoreDocCollector.create(1000, True)
        self.searcher.search(parsed, collector)

        fragment_picker = Highlighter(QueryScorer(parsed))
        fragment_picker.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for hit in collector.topDocs(offset, page_size).scoreDocs:
            stored = self.searcher.doc(hit.doc)
            body = stored['contents']
            stream = self.analyzer.tokenStream("contents", StringReader(body))
            snippet = fragment_picker.getBestFragments(stream, body, 3, "...")

            entry = {}
            entry['title'] = stored['title']
            entry['url'] = stored['url']
            entry['duplicate'] = stored['duplicate']
            entry['highlight'] = snippet
            docs.append(entry)

        del self.searcher

        hit_count = results_total = collector.getTotalHits()
        totalPages = int(math.ceil(results_total / float(page_size)))

        return totalPages, docs
def text_search(command):
    """Search the text index for ``command`` and return highlighted hits.

    ``command`` is split by ``parseCommand`` into a ``{field: value}``
    mapping (with ``"contents"`` passed as its second argument). Every entry
    becomes a MUST clause of one boolean query, and at most 30 hits are
    retrieved.

    The snippet for each hit is built from the ``"contents"`` part of the
    command only. The highlight query is therefore parsed against the
    ``"contents"`` field explicitly; the previous code reused the leaked loop
    variable, i.e. whichever field happened to be iterated last.

    :param command: raw search command string
    :return: list of ``[title, url, key_text]`` lists, where ``key_text`` is
        the concatenation of up to three highlighted fragments with all
        whitespace removed
    :raises KeyError: if the parsed command has no ``"contents"`` entry
    """
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")

    querys = BooleanQuery()
    # items() works on both Python 2 and Python 3 dictionaries.
    for field, value in command_dict.items():
        field_query = QueryParser(Version.LUCENE_CURRENT, field,
                                  envir.analyzer).parse(value)
        querys.add(field_query, BooleanClause.Occur.MUST)

    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs

    # Highlighting always targets the "contents" field, independent of the
    # iteration order of command_dict.
    query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(
        SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))

    res = []
    for scoreDoc in scoreDocs:
        # Find the text surrounding the matched keywords.
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        fragments = myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3)
        # Strip all whitespace from the joined fragments.
        key_text = re.sub(r'\s', '', "".join(fragments))
        res.append([doc.get("title"), doc.get('url'), key_text])
    return res
Exemplo n.º 4
0
    def search(self, q, page=1, duplicates=False):
        """Search for ``q`` and return the requested result page.

        The query string is parsed with ``self.parser``. Unless ``duplicates``
        is true, the parsed query is rewritten by ``self.addDuplicatesQuery``
        before searching. Ten hits make up a page, and every hit carries up to
        three highlighted fragments of its ``contents`` field joined by
        ``"..."``.

        Returns a ``(totalPages, docs)`` tuple; ``docs`` is a list of dicts
        with ``title``, ``url``, ``duplicate`` and ``highlight`` entries.

        ``self.searcher`` is deleted at the end of the call.
        """
        per_page = 10
        first_hit = (page - 1) * per_page

        lucene_query = self.parser.parse(q)
        if not duplicates:
            lucene_query = self.addDuplicatesQuery(lucene_query)

        top_hits = TopScoreDocCollector.create(1000, True)
        self.searcher.search(lucene_query, top_hits)

        snippet_maker = Highlighter(QueryScorer(lucene_query))
        snippet_maker.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        page_hits = top_hits.topDocs(first_hit, per_page).scoreDocs
        for score_doc in page_hits:
            document = self.searcher.doc(score_doc.doc)
            contents = document['contents']
            token_stream = self.analyzer.tokenStream(
                "contents", StringReader(contents))
            highlighted = snippet_maker.getBestFragments(
                token_stream, contents, 3, "...")

            docs.append(dict([
                ('title', document['title']),
                ('url', document['url']),
                ('duplicate', document['duplicate']),
                ('highlight', highlighted),
            ]))

        del self.searcher

        page_count = math.ceil(top_hits.getTotalHits() / float(per_page))
        return int(page_count), docs
Exemplo n.º 5
0
def lucene_search(query, MAX, showHighlight):
    """Search the Lucene index in the current working directory and print hits.

    The query string is parsed against the ``"text"`` field with a
    ``StandardAnalyzer``. For every hit the stored ``"title"`` is printed.
    When ``showHighlight`` is true the output is wrapped in simple HTML and up
    to three highlighted fragments of the ``"text"`` field are printed as
    well.

    :param query: query string to parse
    :param MAX: maximum number of hits to retrieve and print
    :param showHighlight: whether to print HTML markup and highlighted
        fragments in addition to the titles
    :return: None; all output goes to stdout
    """
    index_path = os.getcwd()  # renamed from ``dir`` to stop shadowing the builtin
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(index_path))
    index_reader = DirectoryReader.open(index_dir)
    try:
        lucene_searcher = IndexSearcher(index_reader)
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)
        my_query = QueryParser(Version.LUCENE_48, "text",
                               lucene_analyzer).parse(query)
        # MAX caps the number of hits returned by the searcher.
        total_hits = lucene_searcher.search(my_query, MAX)

        query_scorer = QueryScorer(my_query)
        highlighter = Highlighter(SimpleHTMLFormatter(), query_scorer)
        # SimpleSpanFragmenter is given a fragment size of 50.
        highlighter.setTextFragmenter(SimpleSpanFragmenter(query_scorer, 50))

        # Single-argument print calls behave the same on Python 2 and 3.
        print("Only shows at most %s documents" % MAX)
        if showHighlight:
            print("<br>")

        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)

            if showHighlight:
                print("<p>")

            print(doc.get("title"))

            if showHighlight:
                # Only tokenize the text when it is actually highlighted.
                text = doc.get("text")
                ts = lucene_analyzer.tokenStream("text", StringReader(text))
                print("<br>")
                print(highlighter.getBestFragments(ts, text, 3, "..."))
                print("</p>")
    finally:
        # The reader was previously leaked; always release it.
        index_reader.close()
Exemplo n.º 6
0
    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result
    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result
Exemplo n.º 8
0
    def doStandardHighlights(self):

        formatter = TestFormatter(self)

        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
    def doStandardHighlights(self):
        
        formatter = TestFormatter(self)
        
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream,
                                                  text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
Exemplo n.º 10
0
 def search_by(self, **kwargs):
     command = kwargs.get('command', '')
     if command == '':
         return None
     field = kwargs.get('field')
     query_type = kwargs.get('query_type', 'chi')
     if query_type == 'chi':
         if field in ['token_taglist', 'token_content', 'token_title', 'token_author']:
             command = ' '.join(jieba.cut_for_search(command))
         hlt_analyzer = self.analyzer['ChineseAnalyzer']
     else:
         if field in ['token_content', 'token_title']:
             command = ' '.join(map(stem, command.split()))
         hlt_analyzer = self.analyzer['StandardAnalyzer']
     analyzer = self.analyzer['SimpleAnalyzer']
     num = kwargs.get('num', 50)
     attrs = kwargs.get('attrs', ['url', 'title'])
     print "[%s]\tSearching for '%s' in field '%s'" % (query_type, command, field)
     query = QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command)
     if field in ['token_content', 'token_title']:
         getAbs = True
         query_for_highlight = QueryParser(Version.LUCENE_CURRENT, 'content', hlt_analyzer).parse(command)
         scorer = QueryScorer(query_for_highlight)
         formatter = SimpleHTMLFormatter("<strong>", "</strong>")
         # formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
         highlighter = Highlighter(formatter, scorer)
         fragmenter = SimpleFragmenter(20)
         highlighter.setTextFragmenter(fragmenter)
     else:
         getAbs = False
     scoreDocs = self.searcher.search(query, num).scoreDocs
     print "%s total matching documents." % len(scoreDocs)
     articles = []
     for scoreDoc in scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         article = {}
         for attr in attrs:
             article[attr] = doc.get(attr)
         if getAbs is True:
             content = doc.get('content')
             tokenStream = hlt_analyzer.tokenStream("content", StringReader(content))
             article['abstract'] = highlighter.getBestFragments(tokenStream, content, 3, "...")
         articles.append(article)
     return articles
Exemplo n.º 11
0
    def get_highlighted_hits(self):
        """Collect the best highlighted fragments for every hit in ``self.hits``.

        Each hit's ``'contents'`` field is tokenized through
        ``TokenSources.getAnyTokenStream`` and up to 10 fragments are
        requested from a highlighter configured with
        ``SimpleSpanFragmenter(scorer, 10)``. Every fragment is also printed.

        :return: list of ``(doc_id, fragments)`` tuples, one per hit
        """
        query_scorer = QueryScorer(self.query)
        span_fragmenter = SimpleSpanFragmenter(query_scorer, 10)
        fragment_picker = Highlighter(self.formatter, query_scorer)
        fragment_picker.setTextFragmenter(span_fragmenter)

        collected = []
        for score_doc in self.hits.scoreDocs:
            doc_id = score_doc.doc
            stored = self.searcher.doc(doc_id)
            token_stream = TokenSources.getAnyTokenStream(
                self.index_reader, doc_id, 'contents', self.analyzer)
            fragments = fragment_picker.getBestFragments(
                token_stream, stored.get('contents'), 10)

            for piece in fragments:
                print('fragment: ', piece)

            collected.append((doc_id, fragments))

        return collected
Exemplo n.º 12
0
def get_lm_doc_snippets(query,
                        searcher,
                        qparser,
                        analyzer,
                        preprocessor,
                        topk=10):
    """
    Fetch highlighted snippets for the ``topk`` documents matching ``query``.

    For each hit, up to four fragments of the stored ``"raw"`` field are
    extracted (``SimpleFragmenter(100)``, joined by ``"... "``), passed
    through ``get_parsed_text`` and ``preprocess_text``, and joined with
    single spaces.

    :param query: query string, parsed with ``qparser``
    :param searcher: index searcher used to run the query and load documents
    :param qparser: query parser providing ``parse``
    :param analyzer: analyzer used to tokenize the ``"raw"`` field
    :param preprocessor: object handed to ``preprocess_text``
    :param topk: maximum number of documents to retrieve
    :return: list of ``(did, text)`` pairs, where ``did`` is the stored
        ``"id"`` field of the document
    """
    parsed_query = qparser.parse(query)
    hits = searcher.search(parsed_query, topk).scoreDocs

    snippet_maker = Highlighter(QueryScorer(parsed_query))
    snippet_maker.setTextFragmenter(SimpleFragmenter(100))

    pairs = []
    for hit in hits:
        stored = searcher.doc(hit.doc)
        did = stored.get("id")
        raw_text = stored.get("raw")

        stream = analyzer.tokenStream("raw", StringReader(raw_text))
        fragments = snippet_maker.getBestFragments(stream, raw_text, 4, "... ")

        cleaned = get_parsed_text(fragments)
        tokens = preprocess_text(preprocessor, [cleaned])
        pairs.append((did, " ".join(tokens)))

    return pairs
Exemplo n.º 13
0
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(\
        SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return

            print "Searching for:", command
            query = QueryParser(Version.LUCENE_43, "contents",
                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print "%s total matching documents." % len(scoreDocs)

            # Highlight the matching text in red
            highlighter = Highlighter(SimpleHTMLFormatter('<b><font color\
            ="red">', '</font></b>'), QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    file_handler = open(self.hits_dir + '/' + doc.get("name"),
                        'w+')
                    file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    file_handler = open(self.frags_dir + '/' + doc.get("name"),
                        'w+')
                    file_handler.write(result)
    def _getHits(self):
        """Run ``self.query`` against the index and build hits and facets.

        Opens a reader and searcher on ``self.index``, sorts according to
        ``self.orderby`` / ``self.ascending`` (falling back to volume, then
        number), and retrieves up to 80000 hits. For every hit a ``Hit``
        object is built from the stored fields; selected fields are replaced
        by their highlighted version when the matching criterion is present
        in ``self.criteria``.

        Side effects: sets ``self.hits``, ``self.searcher``, ``self.facets``
        and ``self.hitObjects``.

        Returns the list of ``Hit`` objects.
        """
        reader = IndexReader.open(self.index)
        searcher = IndexSearcher(reader)

        # Sort by volume and entry number (4: treat the value as an integer)
        # Maps an order-by name to (field name, sort type or locale) pairs.
        sortDict = {
            "ta": (("volume", SortField.Type.INT), ("number", SortField.Type.INT)),
            "year": (("year-start", SortField.Type.INT), ("year-end", SortField.Type.INT)),
            "author-title": (("author-sort", SortField.Type.STRING), ("title-sort", SortField.Type.STRING)),
            "title": (("title-sort", Locale.GERMAN),),
            "author": (("author-sort", Locale.GERMAN),),
        }

        sortFields = []

        reverse = not self.ascending

        # Unknown order-by names contribute no sort fields.
        for name in self.orderby:
            for fieldName, typeNum in sortDict.get(name, []):
                sortFields.append(SortField(fieldName, typeNum, reverse))

        # Default ordering; note that it does not apply ``reverse``.
        if len(sortFields) == 0:
            sortFields = [SortField("volume", SortField.Type.INT), SortField("number", SortField.Type.INT)]

        sort = Sort(sortFields)

        topDocs = searcher.search(self.query, None, 80000, sort)
        hits = topDocs.scoreDocs
        self.hits = hits
        # NOTE(review): the reader behind this searcher is closed at the end
        # of this method -- confirm that later users of self.searcher cope.
        self.searcher = searcher

        # Any language other than German falls back to English categories.
        lang = translation.get_language()
        if lang != "de":
            lang = "en"

        # facet name -> {value: number of hits carrying that value}
        facets = {"author": {}, "pubtype": {}, "category-%s" % lang: {}}

        # Highlighting
        highlighter = Highlighter(SimpleHTMLFormatter('<span class="highlight">', '</span>'), QueryScorer(self.query))

        hitObjects = []
        # One dict is reused for all hits; its entries are overwritten on
        # each iteration.
        fields = {}
        for hit in hits:
            doc = searcher.doc(hit.doc)
            # print unicode(doc)
            fields["score"] = hit.score
            fields["volume"] = doc["volume"]
            fields["number"] = doc["number"]
            fields["id"] = doc["id"]
            fields["title"] = doc["title"]
            fields["author"] = doc["author"]
            fields["authors"] = [field.stringValue() for field in doc.getFields("author")]
            for author in fields["authors"]:  # XXX
                facets["author"][author] = facets["author"].get(author, 0) + 1  # XXX

            fields["categories"] = [field.stringValue() for field in doc.getFields("category-%s" % lang)]
            for cat in fields["categories"]:
                facets["category-%s" % lang][cat] = facets["category-%s" % lang].get(cat, 0) + 1
            maxNumFragmentsRequired = 2
            fragmentSeparator = "...";
            pubtype = doc["pubtype"]
            fields["pubtype"] = pubtype
            facets["pubtype"][pubtype] = facets["pubtype"].get(pubtype, 0) + 1
            fields["city"] = doc["city"]
            # Shown as "start - end" when an end year exists and differs.
            fields["year"] = doc["year-start"]
            if fields["year"] and doc["year-end"] and doc["year-end"] != fields["year"]:
                fields["year"] += " - " + doc["year-end"]
            # NOTE(review): ``fields`` never gets a "category" key (only
            # "categories"), so that entry always raises KeyError below and is
            # skipped by the bare ``except`` -- confirm whether intended.
            highlightFields = ("title", "author", "city", "year", "category")

            # A free-text search highlights every highlightable field.
            if "freetext" in self.criteria:
                for fieldName in highlightFields:
                    try:
                        tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired,
                                                              fragmentSeparator)
                        # Keep the original value when nothing was highlighted.
                        if len(newVal) > 0:
                            # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                            fields[fieldName] = newVal
                    except:
                        # Any failure leaves the field unhighlighted.
                        continue

            # Field-specific criteria (plain, "-de" or "-en" variants)
            # highlight only the field they refer to.
            for fieldName in highlightFields:
                if fieldName in self.criteria or fieldName + "-de" in self.criteria or fieldName + "-en" in self.criteria:
                    try:
                        tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired,
                                                              fragmentSeparator)
                        if len(newVal) > 0:
                            # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                            fields[fieldName] = newVal
                    except:
                        # Any failure leaves the field unhighlighted.
                        continue
            # The string literal below is disabled code kept as a no-op
            # expression statement.
            """if "author" in self.criteria:
                try:
                    tokenStream = self.analyzer.tokenStream("author", lucene.StringReader(fields["author"]))
                    fields["author"] = highlighter.getBestFragments(tokenStream, fields["author"], maxNumFragmentsRequired, fragmentSeparator)
                except:
                        pass"""

            hitObjects.append(
                Hit(fields["id"], fields["volume"], fields["number"], fields["title"], fields["author"], fields["city"],
                    fields["year"], fields["categories"], fields["pubtype"], fields["score"]))

        # Drop facets with fewer than two distinct values. The rest become
        # lists of (value, count) pairs ordered by count descending, ties by
        # value ascending (two stable sorts), cut to the top 25.
        facetsToDelete = []
        for facet in facets:
            if len(facets[facet]) < 2:
                facetsToDelete.append(facet)
                continue
            values = sorted(facets[facet].items(), key=itemgetter(0))
            values = sorted(values, key=itemgetter(1), reverse=True)
            facets[facet] = values[:25]
        # Deleted in a second pass so the dict is not modified while iterating.
        for facet in facetsToDelete:
            del facets[facet]
        self.facets = facets
        reader.close()
        self.hitObjects = hitObjects
        return hitObjects
Exemplo n.º 15
0
    def run(self, writer=None, analyzer=None):
        """Interactive search loop that writes highlighted hits to disk.

        Repeatedly prompts for a query, searches the ``"contents"`` field of
        the index in ``self.store_dir`` (top 10 hits) and, for every hit whose
        highlighted text is longer than 10 characters, writes:

        * the highlighted full document to ``self.hits_dir/<name>``
          (``NullFragmenter``), and
        * up to two highlighted fragments to ``self.frags_dir/<name>``
          (``SimpleFragmenter(200)``, joined by ``"..."``),

        where ``<name>`` is the document's stored ``"name"`` field. Entering
        an empty query ends the loop.

        :param writer: defaults to ``self.writer``; not otherwise used here
        :param analyzer: analyzer for parsing and tokenizing; defaults to
            ``self.analyzer``
        :return: None
        """
        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        directory = SimpleFSDirectory.open(File(self.store_dir))
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        try:
            while True:
                print()
                print("Hit enter with no input to quit.")
                command = input("Query:")
                if command == '':
                    return

                print("Searching for:", command)
                query = QueryParser(Version.LUCENE_43, "contents",
                                    analyzer).parse(command)

                # We'll just show the top 10 matching documents for now
                scoreDocs = searcher.search(query, 10).scoreDocs
                print("%s total matching documents." % len(scoreDocs))

                # Highlight the matching text in red. The tag is written on
                # one line: the old backslash continuation inside the literal
                # embedded a run of spaces into the attribute.
                highlighter = Highlighter(
                    SimpleHTMLFormatter('<b><font color="red">',
                                        '</font></b>'),
                    QueryScorer(query))

                # First pass: NullFragmenter, because we want to see the whole
                # document. Second pass: hit fragments of size 200.
                passes = (
                    (NullFragmenter(), self.hits_dir),
                    (SimpleFragmenter(200), self.frags_dir),
                )
                for fragmenter, out_dir in passes:
                    highlighter.setTextFragmenter(fragmenter)

                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        contents = doc.get("contents")
                        tokenStream = analyzer.tokenStream(
                            "contents", StringReader(contents))

                        # arg 3: the maximum number of fragments
                        # arg 4: the separator used to intersperse the
                        # document fragments (typically "...")
                        # arg 3 and 4 don't really matter with NullFragmenter
                        result = highlighter.getBestFragments(
                            tokenStream, contents, 2, "...")

                        if len(result) > 10:
                            # Close the file deterministically instead of
                            # leaking the handle.
                            with open(out_dir + '/' + doc.get("name"),
                                      'w+') as file_handler:
                                file_handler.write(result)
        finally:
            # Release the index resources however the loop ends.
            reader.close()
            directory.close()