Example #1
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example #2
def search_image(command):
    if not command.strip():  # treat blank input as no results
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v.strip())  # strip the space added during accumulation
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the stored value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
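A hedged caller sketch (the JVM bootstrap, the query string, and the printed keys are illustrative, not part of the original): initVM() must run once per process, which is why search_image() only needs to attach the current thread.

import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # once, at startup

for hit in search_image('fox site:example.com'):
    print hit['imgurl'], hit['urltitle']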
Example #3
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # Elwood
        document = Document()
        document.add(
            Field("owner", "elwood", Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "elwoods sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        # Jake
        document = Document()
        document.add(
            Field("owner", "jake", Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "jakes sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        writer.close()
Example #4
def init():
    global searcher, analyzer, vm
    vm = initVM()
    STORE_DIR = "index_qst"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
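A hedged usage sketch for these module-level globals (the field name "contents" and the query text are assumptions):

init()
query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse("fox")
for scoreDoc in searcher.search(query, 10).scoreDocs:
    print searcher.doc(scoreDoc.doc).get("contents")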
Example #6
def init():
    global STORE_DIR, directory, searcher, analyzer, vm_env
    STORE_DIR = "index_lucene_v3_highlight"
    if vm_env is None:
        vm_env = initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Example #7
def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    start = (page - 1) * RPP
    end = start + RPP

    return store[start:end], len(store)
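The slice is 1-based pagination: page p with RPP results per page returns store[(p - 1) * RPP : p * RPP]. A hedged call (the command text is a placeholder, and prior's type is not shown in the snippet):

hits, total = Searchfile("fox", prior, 2, 10)  # page 2, 10 per page -> store[10:20]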
Example #8
    def indexSingleFieldDocs(self, fields):

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for field in fields:
            doc = Document()
            doc.add(field)
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
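A hedged example of the argument this helper expects, one Field per future single-field document (the field texts are assumptions):

self.indexSingleFieldDocs([
    Field("f", "quick brown fox", Field.Store.YES, Field.Index.ANALYZED),
    Field("f", "lazy dog", Field.Store.YES, Field.Index.ANALYZED)])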
Example #9
    def setUp(self):

        # set up sample document
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory)
Example #10
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        self.addPoint(writer, "El Charro", "restaurant", 1, 2)
        self.addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9)
        self.addPoint(writer, "Los Betos", "restaurant", 9, 6)
        self.addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.query = TermQuery(Term("type", "restaurant"))
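addPoint() itself is not shown; a hedged reconstruction, assuming each point is stored as separate name/type/x/y keyword fields:

    def addPoint(self, writer, name, type, x, y):
        # hedged reconstruction, not part of the original snippet
        doc = Document()
        doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("x", str(x), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("y", str(y), Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)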
Example #11
    def setUp(self):

        self.analyzer = WhitespaceAnalyzer()
        self.directory = RAMDirectory()

        writer = IndexWriter(self.directory, self.analyzer, True, 
                             IndexWriter.MaxFieldLength.LIMITED)

        for i in xrange(1, 501):
            doc = Document()
            doc.add(Field("id", NumberUtils.pad(i),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.close()
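Zero-padding makes lexicographic term order agree with numeric order, so term-based range queries over "id" behave numerically. A hedged illustration:

# matches ids 100..200 inclusive, because padded terms sort numerically
query = TermRangeQuery("id", NumberUtils.pad(100), NumberUtils.pad(200), True, True)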
Example #12
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
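cls.keys is not shown; given how keyMap is built (first character is the digit, the rest are the letters it maps), it presumably mirrors a standard phone keypad:

# assumed shape of cls.keys, not part of the original snippet
keys = ["2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"]
# with this table, cls.t9("fox") would produce "369"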
Example #13
    def _createIndex(self, inputDF, colname):
        """
		function to create lucene index, iterates over inputDF row 
		by row, and indexes the relevant column

		By default - WhitespaceAnalyzer is used, other Analyzers are also available.
		"""

        # Create index directory
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)

        # Inline indexing of column data
        inputDF.apply(lambda x: self._addDoc(x[colname], writer), axis=1)

        # Optimize, close and return
        writer.optimize()
        writer.close()
        return directory
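The _addDoc() helper it delegates to is not included; a minimal hedged sketch, assuming one analyzed field per row:

    def _addDoc(self, text, writer):
        # hypothetical companion to _createIndex(), field name assumed
        doc = Document()
        doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)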
Example #14
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(
            Field("field", "the fast fox hopped over the hound",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
Example #15
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example #16
class AnalyzerDemo(object):

    examples = [
        "The quick brown fox jumped over the lazy dogs",
        "XY&Z Corporation - [email protected]"
    ]

    analyzers = [
        WhitespaceAnalyzer(),
        SimpleAnalyzer(),
        StopAnalyzer(Version.LUCENE_CURRENT),
        StandardAnalyzer(Version.LUCENE_CURRENT)
    ]

    def main(cls, argv):

        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):

        print 'Analyzing "%s"' % (text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" % (name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
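For the first example string the result follows from the analyzers' contracts; a hedged illustration of the token text (the exact layout is up to AnalyzerUtils.displayTokens):

# WhitespaceAnalyzer: [The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dogs]
# SimpleAnalyzer:     [the] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dogs]
# StopAnalyzer:       [quick] [brown] [fox] [jumped] [over] [lazy] [dogs]
# StandardAnalyzer:   [quick] [brown] [fox] [jumped] [over] [lazy] [dogs]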
Example #17
    def setUp(self):

        animals = [
            "aardvark", "beaver", "coati", "dog", "elephant", "frog",
            "gila monster", "horse", "iguana", "javelina", "kangaroo", "lemur",
            "moose", "nematode", "orca", "python", "quokka", "rat", "scorpion",
            "tarantula", "uromastyx", "vicuna", "walrus", "xiphias", "yak",
            "zebra"
        ]

        analyzer = WhitespaceAnalyzer()

        aTOmDirectory = RAMDirectory()
        nTOzDirectory = RAMDirectory()

        aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)
        nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)

        for animal in animals:
            doc = Document()
            doc.add(
                Field("animal", animal, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))

            if animal[0].lower() < "n":
                aTOmWriter.addDocument(doc)
            else:
                nTOzWriter.addDocument(doc)

        aTOmWriter.close()
        nTOzWriter.close()

        self.searchers = [
            IndexSearcher(aTOmDirectory),
            IndexSearcher(nTOzDirectory)
        ]
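The fixture splits the alphabet across two indexes; the companion test (not shown) would federate them, e.g. via MultiSearcher. A hedged sketch with an assumed range query:

multi = MultiSearcher(self.searchers)
query = TermRangeQuery("animal", "h", "t", True, True)  # hits land in both halves
topDocs = multi.search(query, 10)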
Example #18
class AnalyzerDemo(object):

    examples = ["http://www.baidu.com/ www.baidu.com",
                "联系 本站 版权 所有 上海 交通 大学BBS 饮水思源 站 沪ICP备020861".decode('gbk')]
    
    analyzers = [WhitespaceAnalyzer(),
                 SimpleAnalyzer(),
                 StopAnalyzer(Version.LUCENE_CURRENT),
                 StandardAnalyzer(Version.LUCENE_CURRENT),
                 CJKAnalyzer(Version.LUCENE_CURRENT)]

    def main(cls, argv):

        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):

        print 'Analyzing "%s"' %(text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" %(name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
Example #19
class SpanQueryTest(TestCase):
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def assertOnlyBrownFox(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(0, topDocs.scoreDocs[0].doc, "wrong doc")

    def assertBothFoxes(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits)

    def assertNoMatches(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)

    def testSpanTermQuery(self):

        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

    def testSpanFirstQuery(self):

        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)

        self.dumpSpans(sfq)

        sfq = SpanFirstQuery(self.brown, 3)
        self.dumpSpans(sfq)
        self.assertOnlyBrownFox(sfq)

    def testSpanNearQuery(self):

        quick_brown_dog = [self.quick, self.brown, self.dog]
        snq = SpanNearQuery(quick_brown_dog, 0, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 4, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)

        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

    def testSpanNotQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)

        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)

        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

    def testSpanOrQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)

        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)

        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)

        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

    def testPlay(self):

        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)

        self.dumpSpans(SpanTermQuery(Term("f", "the")))

        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

    def dumpSpans(self, query):

        spans = query.getSpans(self.reader)
        print "%s:" % query
        numSpans = 0

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        scores = [0, 0]
        for scoreDoc in scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score

        while spans.next():
            numSpans += 1

            id = spans.doc()
            doc = self.reader.document(id)

            # for simplicity, assume tokens are in sequential
            # positions, starting from 0
            stream = self.analyzer.tokenStream("contents",
                                               StringReader(doc.get("f")))
            term = stream.addAttribute(TermAttribute.class_)

            buffer = StringIO()
            buffer.write("   ")

            i = 0
            while stream.incrementToken():
                if i == spans.start():
                    buffer.write("<")

                buffer.write(term.term())
                if i + 1 == spans.end():
                    buffer.write(">")

                buffer.write(" ")
                i += 1

            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")

            print buffer.getvalue()
            # print self.searcher.explain(query, id)

        if numSpans == 0:
            print "   No spans"

        print ''
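The shape of dumpSpans() output is easiest to see on an example; a hedged illustration for SpanFirstQuery(self.brown, 3) against the first document (the score value is indicative only):

# spanFirst(f:brown, 3):
#    the quick <brown> fox jumps over the lazy dog (0.2)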
Example #20
def init():
    global STORE_DIR, directory, searcher, analyzer
    STORE_DIR = "image_index_v3"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Example #21
    def getAnalyzer(self):

        return WhitespaceAnalyzer()
Example #22
    def setUp(self):

        super(QueryParserTest, self).setUp()
        self.analyzer = WhitespaceAnalyzer()
        self.searcher = IndexSearcher(self.directory, True)
Example #24
                          Field.Index.NOT_ANALYZED))
                domin = urlparse.urlsplit(url)[1].split(':')[0]
                doc.add(
                    Field("site", domin, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                title = files[filename][1]
                doc.add(
                    Field("title", title, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                print filename, path, url, domin, title
                if len(contents) > 0:
                    doc.add(
                        Field("contents", contents, Field.Store.YES,
                              Field.Index.ANALYZED))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
                # except Exception, e:
                # print "Failed in indexDocs:", e


if __name__ == '__main__':
    # lucene.initVM()
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    IndexFiles('ori_txt', "index_lucene_v3_highlight",
               WhitespaceAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    print end - start
Example #25
    def index(cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        ver = lucene.Version.LUCENE_35
        config = IndexWriterConfig(ver, WhitespaceAnalyzer(ver))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir,
                                       IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [createCategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so we need to convert the
            #       Python list in order to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List.
            # see http://docs.oracle.com/javase/1.5.0/docs/api/java/util/Arrays.html#asList(T...)
            facetList = lucene.Arrays.asList(facetList)
            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # we do not alter indexing parameters
            # a category document builder will add the categories to a document once build() is called
            categoryDocBuilder = CategoryDocumentBuilder(
                taxo).setCategoryPaths(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(
                Field(TITLE, docTitles[docNum], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field(TEXT, docTexts[docNum], Field.Store.NO,
                      Field.Index.ANALYZED))

            # invoke the category document builder for adding categories to the document and,
            # as required, to the taxonomy index
            categoryDocBuilder.build(doc)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded += 1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,
                                                                nFacetsAdded)
Example #26
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1] 
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url,title,alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("alt", picDict[src][2],
                                 Field.Store.YES,
                                 Field.Index.ANALYZED))
            writer.addDocument(doc)
if __name__ == '__main__': 
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    print 'lucene', lucene.VERSION
    start = datetime.now()
    IndexFiles('txt', "image_index_v3", WhitespaceAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    print end - start