Example #1
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example #2
def search_image(command):
    if not command.strip():  # treat blank input as no results
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v.strip())  # strip the space added during accumulation
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the stored value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
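A hedged caller sketch (the JVM bootstrap, the query string, and the printed keys are illustrative, not part of the original): initVM() must run once per process, which is why search_image() only needs to attach the current thread.

import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # once, at startup

for hit in search_image('fox site:example.com'):
    print hit['imgurl'], hit['urltitle']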
Example #3
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # Elwood
        document = Document()
        document.add(
            Field("owner", "elwood", Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "elwoods sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        # Jake
        document = Document()
        document.add(
            Field("owner", "jake", Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "jakes sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        writer.close()
Example #4
def init():
    global searcher, analyzer, vm
    vm = initVM()
    STORE_DIR = "index_qst"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
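A hedged usage sketch for these module-level globals (the field name "contents" and the query text are assumptions):

init()
query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse("fox")
for scoreDoc in searcher.search(query, 10).scoreDocs:
    print searcher.doc(scoreDoc.doc).get("contents")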
Example #6
def init():
    global STORE_DIR, directory, searcher, analyzer, vm_env
    STORE_DIR = "index_lucene_v3_highlight"
    if vm_env is None:
        vm_env = initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Example #7
def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    start = (page - 1) * RPP
    end = start + RPP

    return store[start:end], len(store)
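The slice is 1-based pagination: page p with RPP results per page returns store[(p - 1) * RPP : p * RPP]. A hedged call (the command text is a placeholder, and prior's type is not shown in the snippet):

hits, total = Searchfile("fox", prior, 2, 10)  # page 2, 10 per page -> store[10:20]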
Example #8
    def indexSingleFieldDocs(self, fields):

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for field in fields:
            doc = Document()
            doc.add(field)
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
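A hedged example of the argument this helper expects, one Field per future single-field document (the field texts are assumptions):

self.indexSingleFieldDocs([
    Field("f", "quick brown fox", Field.Store.YES, Field.Index.ANALYZED),
    Field("f", "lazy dog", Field.Store.YES, Field.Index.ANALYZED)])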
Example #9
    def setUp(self):

        # set up sample document
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory)
Example #10
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        self.addPoint(writer, "El Charro", "restaurant", 1, 2)
        self.addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9)
        self.addPoint(writer, "Los Betos", "restaurant", 9, 6)
        self.addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.query = TermQuery(Term("type", "restaurant"))
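addPoint() itself is not shown; a hedged reconstruction, assuming each point is stored as separate name/type/x/y keyword fields:

    def addPoint(self, writer, name, type, x, y):
        # hedged reconstruction, not part of the original snippet
        doc = Document()
        doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("x", str(x), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("y", str(y), Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)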
Example #11
    def setUp(self):

        self.analyzer = WhitespaceAnalyzer()
        self.directory = RAMDirectory()

        writer = IndexWriter(self.directory, self.analyzer, True, 
                             IndexWriter.MaxFieldLength.LIMITED)

        for i in xrange(1, 501):
            doc = Document()
            doc.add(Field("id", NumberUtils.pad(i),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.close()
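Zero-padding makes lexicographic term order agree with numeric order, so term-based range queries over "id" behave numerically. A hedged illustration:

# matches ids 100..200 inclusive, because padded terms sort numerically
query = TermRangeQuery("id", NumberUtils.pad(100), NumberUtils.pad(200), True, True)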
Example #12
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
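cls.keys is not shown; given how keyMap is built (first character is the digit, the rest are the letters it maps), it presumably mirrors a standard phone keypad:

# assumed shape of cls.keys, not part of the original snippet
keys = ["2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"]
# with this table, cls.t9("fox") would produce "369"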
Example #13
    def _createIndex(self, inputDF, colname):
        """
		function to create lucene index, iterates over inputDF row 
		by row, and indexes the relevant column

		By default - WhitespaceAnalyzer is used, other Analyzers are also available.
		"""

        # Create index directory
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)

        # Inline indexing of column data
        inputDF.apply(lambda x: self._addDoc(x[colname], writer), axis=1)

        # Optimize, close and return
        writer.optimize()
        writer.close()
        return directory
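The _addDoc() helper it delegates to is not included; a minimal hedged sketch, assuming one analyzed field per row:

    def _addDoc(self, text, writer):
        # hypothetical companion to _createIndex(), field name assumed
        doc = Document()
        doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)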
Example #14
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(
            Field("field", "the fast fox hopped over the hound",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
Example #15
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example #16
class AnalyzerDemo(object):

    examples = [
        "The quick brown fox jumped over the lazy dogs",
        "XY&Z Corporation - [email protected]"
    ]

    analyzers = [
        WhitespaceAnalyzer(),
        SimpleAnalyzer(),
        StopAnalyzer(Version.LUCENE_CURRENT),
        StandardAnalyzer(Version.LUCENE_CURRENT)
    ]

    def main(cls, argv):

        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):

        print 'Analyzing "%s"' % (text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" % (name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
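For the first example string the result follows from the analyzers' contracts; a hedged illustration of the token text (the exact layout is up to AnalyzerUtils.displayTokens):

# WhitespaceAnalyzer: [The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dogs]
# SimpleAnalyzer:     [the] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dogs]
# StopAnalyzer:       [quick] [brown] [fox] [jumped] [over] [lazy] [dogs]
# StandardAnalyzer:   [quick] [brown] [fox] [jumped] [over] [lazy] [dogs]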
Example #17
    def setUp(self):

        animals = [
            "aardvark", "beaver", "coati", "dog", "elephant", "frog",
            "gila monster", "horse", "iguana", "javelina", "kangaroo", "lemur",
            "moose", "nematode", "orca", "python", "quokka", "rat", "scorpion",
            "tarantula", "uromastyx", "vicuna", "walrus", "xiphias", "yak",
            "zebra"
        ]

        analyzer = WhitespaceAnalyzer()

        aTOmDirectory = RAMDirectory()
        nTOzDirectory = RAMDirectory()

        aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)
        nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)

        for animal in animals:
            doc = Document()
            doc.add(
                Field("animal", animal, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))

            if animal[0].lower() < "n":
                aTOmWriter.addDocument(doc)
            else:
                nTOzWriter.addDocument(doc)

        aTOmWriter.close()
        nTOzWriter.close()

        self.searchers = [
            IndexSearcher(aTOmDirectory),
            IndexSearcher(nTOzDirectory)
        ]
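The fixture splits the alphabet across two indexes; the companion test (not shown) would federate them, e.g. via MultiSearcher. A hedged sketch with an assumed range query:

multi = MultiSearcher(self.searchers)
query = TermRangeQuery("animal", "h", "t", True, True)  # hits land in both halves
topDocs = multi.search(query, 10)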
Example #18
class AnalyzerDemo(object):

    examples = ["http://www.baidu.com/ www.baidu.com",
                "联系 本站 版权 所有 上海 交通 大学BBS 饮水思源 站 沪ICP备020861".decode('gbk')]
    
    analyzers = [WhitespaceAnalyzer(),
                 SimpleAnalyzer(),
                 StopAnalyzer(Version.LUCENE_CURRENT),
                 StandardAnalyzer(Version.LUCENE_CURRENT),
                 CJKAnalyzer(Version.LUCENE_CURRENT)]

    def main(cls, argv):

        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):

        print 'Analyzing "%s"' %(text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" %(name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
Example #19
class SpanQueryTest(TestCase):
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def assertOnlyBrownFox(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(0, topDocs.scoreDocs[0].doc, "wrong doc")

    def assertBothFoxes(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits)

    def assertNoMatches(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)

    def testSpanTermQuery(self):

        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

    def testSpanFirstQuery(self):

        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)

        self.dumpSpans(sfq)

        sfq = SpanFirstQuery(self.brown, 3)
        self.dumpSpans(sfq)
        self.assertOnlyBrownFox(sfq)

    def testSpanNearQuery(self):

        quick_brown_dog = [self.quick, self.brown, self.dog]
        snq = SpanNearQuery(quick_brown_dog, 0, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 4, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)

        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

    def testSpanNotQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)

        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)

        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

    def testSpanOrQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)

        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)

        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)

        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

    def testPlay(self):

        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)

        self.dumpSpans(SpanTermQuery(Term("f", "the")))

        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

    def dumpSpans(self, query):

        spans = query.getSpans(self.reader)
        print "%s:" % query
        numSpans = 0

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        scores = [0, 0]
        for scoreDoc in scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score

        while spans.next():
            numSpans += 1

            id = spans.doc()
            doc = self.reader.document(id)

            # for simplicity, assume tokens are in sequential
            # positions, starting from 0
            stream = self.analyzer.tokenStream("contents",
                                               StringReader(doc.get("f")))
            term = stream.addAttribute(TermAttribute.class_)

            buffer = StringIO()
            buffer.write("   ")

            i = 0
            while stream.incrementToken():
                if i == spans.start():
                    buffer.write("<")

                buffer.write(term.term())
                if i + 1 == spans.end():
                    buffer.write(">")

                buffer.write(" ")
                i += 1

            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")

            print buffer.getvalue()
            # print self.searcher.explain(query, id)

        if numSpans == 0:
            print "   No spans"

        print ''
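The shape of dumpSpans() output is easiest to see on an example; a hedged illustration for SpanFirstQuery(self.brown, 3) against the first document (the score value is indicative only):

# spanFirst(f:brown, 3):
#    the quick <brown> fox jumps over the lazy dog (0.2)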
Example #20
def init():
    global STORE_DIR, directory, searcher, analyzer
    STORE_DIR = "image_index_v3"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Example #21
    def getAnalyzer(self):

        return WhitespaceAnalyzer()
Example #22
    def setUp(self):

        super(QueryParserTest, self).setUp()
        self.analyzer = WhitespaceAnalyzer()
        self.searcher = IndexSearcher(self.directory, True)
Example #24
                          Field.Index.NOT_ANALYZED))
                domin = urlparse.urlsplit(url)[1].split(':')[0]
                doc.add(
                    Field("site", domin, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                title = files[filename][1]
                doc.add(
                    Field("title", title, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                print filename, path, url, domin, title
                if len(contents) > 0:
                    doc.add(
                        Field("contents", contents, Field.Store.YES,
                              Field.Index.ANALYZED))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
                # except Exception, e:
                # print "Failed in indexDocs:", e


if __name__ == '__main__':
    # lucene.initVM()
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    IndexFiles('ori_txt', "index_lucene_v3_highlight",
               WhitespaceAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    print end - start
Example #25
    def index(cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        ver = lucene.Version.LUCENE_35
        config = IndexWriterConfig(ver, WhitespaceAnalyzer(ver))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir,
                                       IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [createCategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so we need to convert the
            #       Python list in order to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List.
            # see http://docs.oracle.com/javase/1.5.0/docs/api/java/util/Arrays.html#asList(T...)
            facetList = lucene.Arrays.asList(facetList)
            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # we do not alter indexing parameters
            # a category document builder will add the categories to a document once build() is called
            categoryDocBuilder = CategoryDocumentBuilder(
                taxo).setCategoryPaths(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(
                Field(TITLE, docTitles[docNum], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field(TEXT, docTexts[docNum], Field.Store.NO,
                      Field.Index.ANALYZED))

            # invoke the category document builder for adding categories to the document and,
            # as required, to the taxonomy index
            categoryDocBuilder.build(doc)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded += 1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,
                                                                nFacetsAdded)
Example #26
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1] 
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url,title,alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("alt", picDict[src][2],
                                 Field.Store.YES,
                                 Field.Index.ANALYZED))
            writer.addDocument(doc)
if __name__ == '__main__': 
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    print 'lucene', lucene.VERSION
    start = datetime.now()
    IndexFiles('txt', "image_index_v3", WhitespaceAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    print end - start