    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # Elwood
        document = Document()
        document.add(
            Field("owner", "elwood", Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "elwoods sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        # Jake
        document = Document()
        document.add(
            Field("owner", "jake", Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "jakes sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        writer.close()
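A hedged aside, not part of the original snippet: because "owner" is indexed NOT_ANALYZED, a search can be confined to one user by wrapping a TermQuery in a QueryWrapperFilter (a standard API of this Lucene era); a minimal sketch:

    def searchForJakeOnly(self):
        # sketch under assumptions: restrict hits to documents owned by "jake"
        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("keywords", "info"))
        ownerFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))
        topDocs = searcher.search(query, ownerFilter, 10)
        searcher.close()
        return topDocs.totalHits  # expected 1: Elwood's document is filtered out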
Example #2
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
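A hedged usage sketch, assumed rather than taken from the original test: the span term queries above compose into a SpanNearQuery, which matches only when the spans fall within a given slop, optionally in document order:

    def testQuickNearFox(self):
        # sketch: "quick" followed by "fox" with at most one term between them
        near = SpanNearQuery([self.quick, self.fox], 1, True)
        topDocs = self.searcher.search(near, 10)
        self.assertEqual(2, topDocs.totalHits)  # both documents match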
Example #3
    def addDocuments(self, dir):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        #
        # change to adjust performance of indexing with FSDirectory
        # writer.mergeFactor = writer.mergeFactor
        # writer.maxMergeDocs = writer.maxMergeDocs
        # writer.minMergeDocs = writer.minMergeDocs
        #

        for word in self.docs:
            doc = Document()
            doc.add(
                Field("keyword", word, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(Field("unindexed", word, Field.Store.YES, Field.Index.NO))
            doc.add(
                Field("unstored", word, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("text", word, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #4
File: app.py  Project: ProjectLISM/GUI
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (
        writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example #5
    def testUpdate(self):

        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
        doc.add(
            Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
        doc.add(
            Field("city", "St. Petersburg", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
Example #6
def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
        doc.add(
            Field('id', str(article['id'][0]), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('title', article['url'], Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('content', article['text'], Field.Store.NO,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
Example #7
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return

        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        dir = FSDirectory.getDirectory(indexDir, True)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = System.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(
                Field("fieldname", "Bibamus", Field.Store.YES,
                      Field.Index.TOKENIZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
Example #8
def createIndex():
    #initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    
    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    
    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
   
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
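parsehtml is not shown in this snippet; a minimal hypothetical sketch of what it might do, using the Python 2 standard library (the real project may parse differently; the (text, errors) return shape matches the caller above):

    from HTMLParser import HTMLParser

    def parsehtml(data):
        # hypothetical: collect the visible text of an HTML page,
        # returning (text, errors) as the caller above expects
        class TextCollector(HTMLParser):
            def __init__(self):
                HTMLParser.__init__(self)
                self.chunks = []
            def handle_data(self, d):
                self.chunks.append(d)
        parser = TextCollector()
        parser.feed(data)
        return ' '.join(parser.chunks), []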
Example #9
def luceneIndexer(contents):
    lucene.initVM()
    

    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
        print index_writer.numDocs()
    # optimize once after all documents are added, not once per document
    index_writer.optimize()
    index_writer.close()
Example #10
 def indexDocs(self, root, writer):
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             if not filename.endswith('.txt'):
                 continue
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 for line in file:
                     doc = Document()
                     arr = line.split('\t')
                     field = Field("name", arr[2].lower(), Field.Store.YES,
                                   Field.Index.TOKENIZED)
                     field.setBoost(1.5)
                     doc.add(field)
                     doc.add(
                         Field("alternate_names", arr[3].lower(),
                               Field.Store.YES, Field.Index.TOKENIZED))
                     doc.add(
                         Field("state", arr[10].lower(), Field.Store.YES,
                               Field.Index.TOKENIZED))
                     doc.add(
                         Field("population", arr[14], Field.Store.YES,
                               Field.Index.UN_TOKENIZED))
                     if int(arr[14]) > 1000000:
                         doc.setBoost(1.2)
                     writer.addDocument(doc)
                 file.close()
             except Exception, e:
                 print "Failed in indexDocs:", e
Example #11
    def indexDocs(self, root, writer):

        f = codecs.open('picIndex.txt','r',encoding='utf-8')
        picDict = {}
        for line in f.xreadlines():
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1] 
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url,title,alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("alt", picDict[src][2],
                                 Field.Store.YES,
                                 Field.Index.ANALYZED))
            writer.addDocument(doc)
Example #12
    def _addDoc(self, text, writer):
        """
		function to add documents in the lucene index. 
		text fields are indexed by the name "field"
		"""

        doc = Document()
        doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
Example #13
 def addCrowd(self, id, text):
     doc = Document()
     doc.add(
         Field(CrowdFields.id, id, Field.Store.YES,
               Field.Index.NOT_ANALYZED))
     doc.add(
         Field(CrowdFields.text, text, Field.Store.YES,
               Field.Index.ANALYZED))
     self.writer.updateDocument(Term(CrowdFields.id, id), doc)
Example #14
def _IndexField(field_list, content):
    doc = Document()
    for field, value in zip(field_list, content):
        if field['StringField'] is not False:
            _IndexStringField(doc, field['FieldName'], value)
        elif field['NumericField'] is not False:
            _IndexNumericField(doc, field['FieldName'], value)
    return doc
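_IndexStringField and _IndexNumericField are not defined in this snippet; a hypothetical sketch of the two helpers (Lucene 3.x-era NumericField API assumed):

    def _IndexStringField(doc, field_name, value):
        # hypothetical: analyzed, stored text field
        doc.add(Field(field_name, value, Field.Store.YES, Field.Index.ANALYZED))

    def _IndexNumericField(doc, field_name, value):
        # hypothetical: NumericField enables efficient numeric range queries
        doc.add(NumericField(field_name, Field.Store.YES, True).setIntValue(int(value)))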
Example #15
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(
            Field("field", "the fast fox hopped over the hound",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
Example #16
 def addDocuments(self, _id, title, content):
     doc = Document()
     doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
     if title is not None and len(title) > 0:
         doc.add(
             Field("titleKeyword", title, Field.Store.NO,
                   Field.Index.ANALYZED))
     if content is not None and len(content) > 0:
         doc.add(
             Field("contentKeyword", content, Field.Store.NO,
                   Field.Index.ANALYZED))
     self.index_writer.addDocument(doc)
Example #17
    def indexSingleFieldDocs(self, fields):

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for field in fields:
            doc = Document()
            doc.add(field)
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #18
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("contents",
                      "The quick brown fox jumps over the lazy dogs",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()
Example #19
def indexFile(dir, filename):

    path = os.path.join(dir, filename)
    print "  File: ", filename

    if filename.endswith('.gz'):
        child = Popen('gunzip -c ' + path +
                      ' | groff -t -e -E -mandoc -Tascii | col -bx',
                      shell=True,
                      stdout=PIPE,
                      cwd=os.path.dirname(dir)).stdout
        command, section = re.search('^(.*)\.(.*)\.gz$', filename).groups()
    else:
        child = Popen('groff -t -e -E -mandoc -Tascii ' + path + ' | col -bx',
                      shell=True,
                      stdout=PIPE,
                      cwd=os.path.dirname(dir)).stdout
        command, section = re.search('^(.*)\.(.*)$', filename).groups()

    data = child.read()
    err = child.close()
    if err:
        raise RuntimeError, '%s failed with exit code %d' % (command, err)

    matches = re.search('^NAME$(.*?)^\S', data, re.MULTILINE | re.DOTALL)
    name = matches and matches.group(1) or ''

    matches = re.search('^(?:SYNOPSIS|SYNOPSYS)$(.*?)^\S', data,
                        re.MULTILINE | re.DOTALL)
    synopsis = matches and matches.group(1) or ''

    # like the sections above, capture the body up to the next unindented line
    matches = re.search('^(?:DESCRIPTION|OVERVIEW)$(.*?)^\S', data,
                        re.MULTILINE | re.DOTALL)
    description = matches and matches.group(1) or ''

    doc = Document()
    doc.add(
        Field("command", command, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(
        Field("section", section, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("name", name.strip(), Field.Store.YES, Field.Index.ANALYZED))
    doc.add(
        Field("synopsis", synopsis.strip(), Field.Store.YES,
              Field.Index.ANALYZED))
    doc.add(
        Field("keywords", ' '.join((command, name, synopsis, description)),
              Field.Store.NO, Field.Index.ANALYZED))
    doc.add(
        Field("filename", os.path.abspath(path), Field.Store.YES,
              Field.Index.NOT_ANALYZED))

    writer.addDocument(doc)
Example #20
    def addPoint(self, writer, name, type, x, y):

        doc = Document()
        doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("x", str(x), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(
            Field("y", str(y), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))

        writer.addDocument(doc)
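Because x and y are indexed NOT_ANALYZED_NO_NORMS as strings, they can drive sorting at search time. A hedged sketch (assumed usage; note that a string sort is lexicographic, so numeric values should be zero-padded for a true numeric order):

    def sortByX(self, searcher):
        # sketch: order all points by their "x" field using a string sort
        sort = Sort(SortField("x", SortField.STRING))
        return searcher.search(MatchAllDocsQuery(), None, 10, sort)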
Example #21
    def setUp(self):

        # set up sample document
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory)
Example #22
    def setUp(self):

        self.analyzer = WhitespaceAnalyzer()
        self.directory = RAMDirectory()

        writer = IndexWriter(self.directory, self.analyzer, True, 
                             IndexWriter.MaxFieldLength.LIMITED)

        for i in xrange(1, 501):
            doc = Document()
            doc.add(Field("id", NumberUtils.pad(i),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.close()
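NumberUtils.pad gives every id the same width, so lexicographic term order matches numeric order. A hedged sketch of the range query this enables (assumed usage of the Lucene 2.9+ TermRangeQuery API):

    def testIdRange(self):
        # sketch: ids 100..200 inclusive, found via an ordinary term range
        query = TermRangeQuery("id", NumberUtils.pad(100), NumberUtils.pad(200),
                               True, True)
        searcher = IndexSearcher(self.directory, True)
        topDocs = searcher.search(query, 500)
        self.assertEqual(101, topDocs.totalHits)
        searcher.close()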
Example #23
    def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):

        STORED = Field.Store.YES
        UN_INDEXED = Field.Index.NO
        UN_TOKENIZED = Field.Index.UN_TOKENIZED

        doc = Document()
        doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
        doc.add(Field("version", str(version), STORED, UN_INDEXED))
        reader = StringReader(reader.read())
        doc.add(Field("contents", reader, Field.TermVector.YES))

        indexWriter.addDocument(doc)
Example #24
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("partnum", "Q36", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("description", "Illidium Space Modulator", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
Example #25
    def write_index(self, workflow, property=False):
        """
        Adds all keywords in the workflow to the index at the specified
        location. The types of the keywords are preserved when property=True.
        """
        self.ddict = dict()
        # name is used as id in this case
        self.adddd("workflow_id", str(workflow.id))

        self.adddd("text", workflow.id)
        self.adddd("text", workflow.name)
        # this is a key for the workflow
        #adddd("workflow_source", workflow.source)
        #self.adddd("text", workflow.source)
        #adddd("workflow_type", workflow.type)
        self.adddd("text", workflow.type)
        # not very interesting
        #d.add( Field("workflow_version", workflow.version, save, Field.Index.UN_TOKENIZED))
        self.indexAnnotations(workflow.annotations, property)

        for module in workflow.modules:    
            self.adddd("module_name" if property else "text", module.name)
            self.adddd("package"     if property else "text", module.package)
            # not very interesting
            #d.add( Field("package_version", module.version, save, Field.Index.UN_TOKENIZED))
            self.adddd("module_type" if property else "text", module.type)
            self.indexAnnotations(module.annotations, property)
            for p in module.parameters:
                self.adddd("parameter_name" if property else "text", p.name)
                self.adddd("parameter_value" if property else "text", p.value)
                self.adddd("parameter_type" if property else "text", p.type)
                self.indexAnnotations(p.annotations, property)

        for c in workflow.connections:    
            self.adddd("port_name" if property else "text", c.startPort)
            self.adddd("port_name" if property else "text", c.endPort)
            self.indexAnnotations(c.annotations, property)

        d = Document()
        for (k, v) in self.ddict.iteritems():
            d.add(Field(k, v, self.save, Field.Index.TOKENIZED))

        # Delete old versions
        WorkflowIndexer.writer.deleteDocuments(
            [Term('workflow_id', str(workflow.id))] )
        # add new
        WorkflowIndexer.writer.addDocument(d)
Example #26
    def index(self):

        dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                               "verbose-index")
        dir = FSDirectory.getDirectory(dirPath, True)
        writer = IndexWriter(dir, SimpleAnalyzer(), True)

        writer.setInfoStream(System.out)

        for i in xrange(100):
            doc = Document()
            doc.add(Field("keyword", "goober",
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #27
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
Example #28
    def indexDocs(self, root, writer):

        f = codecs.open('infoIndex.txt', 'r', encoding='utf-8')
        files = {}
        for line in f.xreadlines():
            ls = line.split()
            files[ls[0] + '.txt'] = [ls[1], ls[2]]
        f.close()

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print("adding"), filename
                # try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                doc = Document()
                doc.add(
                    Field("name", filename, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(
                    Field("path", path, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                url = files[filename][0]
                doc.add(
                    Field("url", url, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                domain = urlparse.urlsplit(url)[1].split(':')[0]
                doc.add(
                    Field("site", domain, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                title = files[filename][1]
                doc.add(
                    Field("title", title, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                print filename, path, url, domain, title
                if len(contents) > 0:
                    doc.add(
                        Field("contents", contents, Field.Store.YES,
                              Field.Index.ANALYZED))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
Example #29
    def indexFile(self, writer, path):

        file = open(path)
        string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
        file.close()

        doc = Document()
        doc.add(Field("contents", StringReader(string)))
        doc.add(
            Field("filename", os.path.abspath(path), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

        return doc
Example #30
    def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):

        STORED = Field.Store.YES
        UN_STORED = Field.Store.NO
        TOKENIZED = Field.Index.TOKENIZED
        UN_INDEXED = Field.Index.NO
        UN_TOKENIZED = Field.Index.UN_TOKENIZED

        doc = Document()
        doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
        doc.add(Field("version", str(version), STORED, UN_INDEXED))
        doc.add(
            Field("contents", value, UN_STORED, TOKENIZED,
                  Field.TermVector.YES))
        indexWriter.addDocument(doc)
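A hedged closing note on Field.TermVector.YES, with assumed retrieval code that is not part of the original: the stored per-document vector can be read back through IndexReader.

    # sketch: "directory" is hypothetical here; the method above only sees indexWriter
    reader = IndexReader.open(directory, True)
    vector = reader.getTermFreqVector(0, "contents")  # TermFreqVector or None
    if vector is not None:
        print vector.getTerms()            # distinct terms in doc 0's "contents"
        print vector.getTermFrequencies()  # parallel array of frequencies
    reader.close()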