def luceneIndexer(contents): lucene.initVM() INDEXIDR= settings.INDEX_DIR indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in contents: print"Indexing: ", tfile document= Document() content= tfile.getvalue() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print"Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def configure_lucene(): f = open('clique.txt', 'r') lucene.initVM() print 'Inside Function' #indexDir = "/tmp/luceneindex" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512)) print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs( ) print >> sys.stderr, "Reading lines from sys.stdin..." for line in f: line = line.replace('\t', '') line = line.replace('\r', '') line = line.replace('\n', '') line = line.replace('^', '') line = line.strip() doc = Document() doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % ( writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs( ) writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs( ) print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs() writer.close()
def setUp(self):
    """Build two RAM indexes of animal names, split alphabetically (a-m / n-z)."""
    animals = [
        "aardvark", "beaver", "coati", "dog", "elephant", "frog",
        "gila monster", "horse", "iguana", "javelina", "kangaroo",
        "lemur", "moose", "nematode", "orca", "python", "quokka",
        "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
        "walrus", "xiphias", "yak", "zebra"
    ]
    analyzer = WhitespaceAnalyzer()
    firstHalfDir = RAMDirectory()
    secondHalfDir = RAMDirectory()
    firstWriter = IndexWriter(firstHalfDir, analyzer, True,
                              IndexWriter.MaxFieldLength.UNLIMITED)
    secondWriter = IndexWriter(secondHalfDir, analyzer, True,
                               IndexWriter.MaxFieldLength.UNLIMITED)
    for name in animals:
        doc = Document()
        doc.add(Field("animal", name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # Route by first letter: a-m into the first index, n-z into the second.
        if name[0].lower() < "n":
            firstWriter.addDocument(doc)
        else:
            secondWriter.addDocument(doc)
    firstWriter.close()
    secondWriter.close()
    # One searcher per half, consumed by multi-searcher tests.
    self.searchers = [IndexSearcher(firstHalfDir),
                      IndexSearcher(secondHalfDir)]
def main(cls, argv): if len(argv) < 5: print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>" return docsInIndex = int(argv[1]) # create an index called 'index-dir' in a temp directory indexDir = os.path.join(tempfile.gettempdir(), 'index-dir') dir = FSDirectory.open(indexDir,) analyzer = SimpleAnalyzer() writer = IndexWriter(dir, analyzer, True) # set variables that affect speed of indexing writer.setMergeFactor(int(argv[2])) writer.setMaxMergeDocs(int(argv[3])) writer.setMaxBufferedDocs(int(argv[4])) # writer.infoStream = tempfile.out print "Merge factor: ", writer.getMergeFactor() print "Max merge docs:", writer.getMaxMergeDocs() print "Max buffered docs:", writer.getMaxBufferedDocs() start = time() for i in xrange(docsInIndex): doc = Document() doc.add(Field("fieldname", "Bibamus", Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) writer.close() print "Time: ", timedelta(seconds=time() - start)
def index(source, indexName): if(not os.path.exists(indexName)): os.mkdir(indexName) indexDir = File(indexName) writer = IndexWriter(SimpleFSDirectory(File(indexName)),StandardAnalyzer(Version.LUCENE_CURRENT), True,IndexWriter.MaxFieldLength.LIMITED) p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL) res = p.findall(source) i = 0 for pair in res: i += 1 doc = Document() doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO)) for t in pair[1].split(): doc.add(Field("content", t.replace("-","_"), Field.Store.NO, Field.Index.NOT_ANALYZED)); #doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc) writer.close() print str(i)+ " docs indexed"
def setUp(self):
    """Index two fox sentences and prebuild one SpanTermQuery per word of interest."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for sentence in ("the quick brown fox jumps over the lazy dog",
                     "the quick red fox jumps over the sleepy cat"):
        doc = Document()
        doc.add(Field("f", sentence, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # Expose each query as an attribute: self.quick, self.brown, ...
    for word in ("quick", "brown", "red", "fox",
                 "lazy", "sleepy", "dog", "cat"):
        setattr(self, word, SpanTermQuery(Term("f", word)))
def createIndex():
    """Parse every file under html_files/ and index its extracted text."""
    print("started indexer")
    # Initialize lucene and the JVM.
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    src_dir = 'html_files'
    count = 0
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as handle:
            raw = handle.read()
        count += 1
        # parsehtml strips the markup; errors are ignored here.
        text, errors = parsehtml(raw)
        doc = Document()
        doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def setUp(self):
    """Create a two-document RAM index and a set of span-query fixtures."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentences = ["the quick brown fox jumps over the lazy dog",
                 "the quick red fox jumps over the sleepy cat"]
    for text in sentences:
        doc = Document()
        doc.add(Field("f", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # One SpanTermQuery per vocabulary word, bound as an attribute.
    words = ["quick", "brown", "red", "fox", "lazy", "sleepy", "dog", "cat"]
    for w in words:
        setattr(self, w, SpanTermQuery(Term("f", w)))
def main(cls, argv): if len(argv) < 5: print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>" return docsInIndex = int(argv[1]) # create an index called 'index-dir' in a temp directory indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'), 'index-dir') dir = FSDirectory.getDirectory(indexDir, True) analyzer = SimpleAnalyzer() writer = IndexWriter(dir, analyzer, True) # set variables that affect speed of indexing writer.setMergeFactor(int(argv[2])) writer.setMaxMergeDocs(int(argv[3])) writer.setMaxBufferedDocs(int(argv[4])) # writer.infoStream = System.out print "Merge factor: ", writer.getMergeFactor() print "Max merge docs:", writer.getMaxMergeDocs() print "Max buffered docs:", writer.getMaxBufferedDocs() start = time() for i in xrange(docsInIndex): doc = Document() doc.add( Field("fieldname", "Bibamus", Field.Store.YES, Field.Index.TOKENIZED)) writer.addDocument(doc) writer.close() print "Time: ", timedelta(seconds=time() - start)
def index(string):
    """Append *string* as one document to REMOVEME.index-dir.

    Opens the index in append mode; when the index does not exist yet the
    open raises a JavaError and the writer is reopened in create mode.
    """
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    store = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        writer = IndexWriter(store, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # No existing index: create a fresh one.
        writer = IndexWriter(store, analyzer, True,
                             IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexfeeds(self,writer): """ """ feedlist=['http://today.reuters.com/rss/topNews', 'http://today.reuters.com/rss/domesticNews', 'http://today.reuters.com/rss/wordNews', 'http://rss.cnn.com/rss/edition.rss', 'http://rss.cnn.com/rss/edition_word.rss', 'http://rss.cnn.com/rss/edition_us.rss'] articletitles=[] for feed in feedlist: f=feedparser.parse(feed) for e in f.entries: if e.title in articletitles: continue contents = e.title.encode('utf8') + self.strphtml(e.description.encode('utf8')) try: doc = Document() doc.add(Field("name", e.title, Field.Store.YES, Field.Index.NOT_ANALYZED)) if len(contents) > 0: doc.add(Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES)) writer.addDocument(doc) except Exception, e: print 'Unable to index'
def configure_lucene(): f = open('clique.txt','r') lucene.initVM() print 'Inside Function' #indexDir = "/tmp/luceneindex" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512)) print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs() print >> sys.stderr, "Reading lines from sys.stdin..." for line in f: line = line.replace('\t','') line = line.replace('\r','') line = line.replace('\n','') line = line.replace('^','') line = line.strip() doc = Document() doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs() writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs() print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs() writer.close()
def run(self):
    """Consume the Twitter sample stream, indexing screen name and text.

    Any per-tweet failure (missing keys, writer errors) is swallowed so
    the stream keeps flowing.
    """
    env.attachCurrentThread()
    stream = tweetstream.SampleStream("username", "password")
    for tweet in stream:
        try:
            body = unicode(tweet['text'])
            author = tweet['user']['screen_name']
            doc = Document()
            doc.add(Field("user_name", author,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            if len(body) > 0:
                doc.add(Field("contents", body,
                              Field.Store.YES, Field.Index.ANALYZED))
            self.writer.addDocument(doc)
            # Optimize for fast search and commit so the tweet is
            # immediately visible to searchers.
            self.writer.optimize()
            self.writer.commit()
        except Exception as e:
            pass
def add_new_document_with_metadata(writer,filepath,fieldnames,values): file = open(filepath) contents = unicode(file.read(), 'UTF-8') file.close() doc = Document() # add name, path, and contents fields doc.add(Field("name", os.path.basename(filepath), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("path", os.path.realpath(filepath), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("txtorg_id", str(uuid.uuid1()), Field.Store.YES, Field.Index.NOT_ANALYZED)) if len(contents) > 0: doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)) else: print "warning: no content in %s" % filename for idx in range(len(fieldnames)): doc.add(Field(fieldnames[idx].lower(),values[idx].lower(),Field.Store.YES,Field.Index.NOT_ANALYZED)) writer.addDocument(doc)
def reindex_all(reader, writer, analyzer): for i in xrange(reader.maxDoc()): if reader.isDeleted(i): continue doc = reader.document(i) p = doc.get("path") pkid = doc.get('txtorg_id') if p is None: # No filepath specified, just use original document writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer) else: # if a path field is found, try to read the file it points to and add a contents field edited_doc = Document() for f in doc.getFields(): edited_doc.add(Field.cast_(f)) try: inf = open(p) contents = unicode(inf.read(), 'UTF-8') inf.close() if len(contents) > 0: edited_doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)) else: print "warning: no content in %s" % filename except: print "Could not read file; skipping" writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
def _addDoc(self, text, writer):
    """Add one document to the index, storing *text* under the analyzed
    field named "field"."""
    stored = Field("field", text, Field.Store.YES, Field.Index.ANALYZED)
    doc = Document()
    doc.add(stored)
    writer.addDocument(doc)
def addCrowd(self, id, text):
    """Insert or replace the crowd document keyed by *id*."""
    idField = Field(CrowdFields.id, id,
                    Field.Store.YES, Field.Index.NOT_ANALYZED)
    textField = Field(CrowdFields.text, text,
                      Field.Store.YES, Field.Index.ANALYZED)
    doc = Document()
    doc.add(idField)
    doc.add(textField)
    # updateDocument removes any existing doc with this id, then adds.
    self.writer.updateDocument(Term(CrowdFields.id, id), doc)
def addContents(self,contents): try: #iwconfig = IndexWriterConfig(SimpleAnalyzer(),IndexWriter.MaxFieldLength.LIMITED) writer = IndexWriter(self.ramIndex,SimpleAnalyzer(Version.LUCENE_CURRENT),True,IndexWriter.MaxFieldLength.LIMITED) for content in contents: doc = Document() doc.add(Field("contents",content[1],Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES)) writer.addDocument(doc) writer.close() except Exception,e: print 'Unable to add content to RAM index'
def indexSingleFieldDocs(self, fields):
    """Write one single-field document per Field, then optimize and close."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single in fields:
        wrapper = Document()
        wrapper.add(single)
        writer.addDocument(wrapper)
    writer.optimize()
    writer.close()
def indexSingleFieldDocs(self, fields):
    """Write one single-field document per Field, commit, and close."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single in fields:
        wrapper = Document()
        wrapper.add(single)
        writer.addDocument(wrapper)
    # Commit (rather than optimize) so the docs become visible.
    writer.commit()
    writer.close()
def setUp(self):
    """Create a one-document RAM index analyzed with the porter analyzer."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentence = "The quick brown fox jumps over the lazy dogs"
    doc = Document()
    doc.add(Field("contents", sentence,
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def setUp(self):
    """Index a single sample sentence and open a searcher over it."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("field",
                     "the quick brown fox jumped over the lazy dog",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
    self.searcher = IndexSearcher(directory)
def setUp(self):
    """Index one document per owner (elwood, jake) with owner + keywords fields."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    records = [("elwood", "elwoods sensitive info"),
               ("jake", "jakes sensitive info")]
    for owner, keywords in records:
        document = Document()
        document.add(Field("owner", owner,
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(Field("keywords", keywords,
                           Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(document)
    writer.close()
def index(self,path_to_index,path_files): 'indexes anchor texts from a given folder' #lucene.initVM() indexDir = path_to_index directory_index = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_35) writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512)) listOfPathes = [] listOfPathes.extend(glob.glob(path_files+"*.txt")) counter = 0 for path_to_file in listOfPathes: print path_to_file f = open(path_to_file,"r") for line in f: entry = line.split("\t") counter+=1 """ optimizes index after a certain amount of added documents """ if counter%500000==0: print counter writer.optimize() doc = Document() doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) writer.optimize() f.close() writer.close() print counter print "done"
def indexDocs(self, root, writer): for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.txt'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) for line in file: doc = Document() arr = line.split('\t') field = Field("name", arr[2].lower(), Field.Store.YES, Field.Index.TOKENIZED) field.setBoost(1.5) doc.add(field) doc.add( Field("alternate_names", arr[3].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add( Field("state", arr[10].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add( Field("population", arr[14], Field.Store.YES, Field.Index.UN_TOKENIZED)) if int(arr[14]) > 1000000: doc.setBoost(1.2) writer.addDocument(doc) file.close() except Exception, e: print "Failed in indexDocs:", e
def luceneIndexer(docdir,indir): """ IndexDocuments from a directory. Args: docdir:文档所在文件夹 indir:索引存放文件夹 Returns: 无返回值 说明: FieldType().setStored=as-is value stored in the Lucene index FieldType().setTokenized=field is analyzed using the specified Analyzer - the tokens emitted are indexed FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted) FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion. """ """#类型1属性:对于需要检索,需要返回显示setStored(True) type1 = FieldType() type1.setIndexed(True) type1.setStored(True) type1.setTokenized(False) type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) #类型2属性:对于不用返回显示,但是需要进行检索的字段。这里我认为文本内容(content)是这一种的,通常例如文件的META信息。 type2 = FieldType() type2.setIndexed(True) type2.setStored(False) type2.setTokenized(True) type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)""" lucene.initVM() DIRTOINDEX= docdir INDEXIDR= indir indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) #用指定的语言分析器构造一个新的写索引器. index_writer= IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')): #print "Indexing: " print "Indexing:", tfile; document = Document() content = open(tfile,'r').read() #类型使用方式 #doc.add(Field("path", tfile, type1)) #文档新增字段(Field){字段名:"text",存储:“YES”,索引:"YES"} document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED)) document.add(Field("path",tfile,Field.Store.YES,Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def do_index(): initVM() indexDir = "/home/william/woyaoo/luceneindex" version = Version.LUCENE_CURRENT standardAnalyzer = StandardAnalyzer(version) # chineseAnalyzer = CJKAnalyzer(version) engine = data.engine_from_config("indexdb.config") # engine = data.engine_from_config() db = data.init_datafactory(engine) docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all() print len(docs) idxDir = SimpleFSDirectory(File(indexDir)) perIndexCount = 5000 writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512)) # add field for doc in docs: # print repr(doc.description) lucenedoc = Document() descriptionValue = doc.description.strip("\r\n").encode("UTF-8") # descriptionValue ='中国 abc' print repr(descriptionValue) lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED)) lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED)) # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED)) lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED)) lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(lucenedoc) writer.optimize() writer.close() print "index finished"
def addDocuments(self, dir):
    """Populate *dir* with one document per word in self.docs.

    Every document carries the same value under four field configurations:
    keyword (stored, unanalyzed), unindexed (stored only), unstored
    (analyzed only) and text (stored + analyzed).
    """
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # mergeFactor / maxMergeDocs / minMergeDocs could be tuned here to
    # adjust FSDirectory indexing performance.
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexDocs(self, root, writer):
    """Load picIndex.txt ('seg^*'-separated records) and index one doc per image.

    Record layout: url, title, src, alt.  Docs are keyed by src; only the
    alt text is analyzed.
    """
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    for line in f.xreadlines():
        parts = line.split('seg^*')
        # Later records with the same src overwrite earlier ones.
        picDict[parts[2]] = [parts[0], parts[1], parts[3]]
    f.close()
    for src in picDict:
        url, title, alt = picDict[src]
        doc = Document()
        doc.add(Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", alt, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def addDocuments(self, dir, isCompound):
    """Index self.docs into *dir*, toggling compound-file format.

    Same four-field layout per word as the sibling addDocuments helper.
    """
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # mergeFactor / maxMergeDocs / minMergeDocs could be tuned here to
    # adjust FSDirectory indexing performance.
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexDocs(self, root, writer): for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.txt'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) for line in file: doc = Document() arr = line.split('\t') field = Field("name", arr[2].lower(), Field.Store.YES, Field.Index.TOKENIZED) field.setBoost(1.5) doc.add(field) doc.add(Field("alternate_names", arr[3].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add(Field("state", arr[10].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add(Field("population", arr[14], Field.Store.YES, Field.Index.UN_TOKENIZED)) if int(arr[14]) > 1000000: doc.setBoost(1.2) writer.addDocument(doc) file.close() except Exception, e: print "Failed in indexDocs:", e
def setUp(self):
    """Build a one-document sample index and a searcher for the tests."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    text = "the quick brown fox jumped over the lazy dog"
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(directory)
def setUp(self):
    """Index 500 documents whose ids are zero-padded via NumberUtils.pad."""
    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for n in xrange(1, 501):
        doc = Document()
        doc.add(Field("id", NumberUtils.pad(n),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.close()
def setUp(self):
    """Index one part (Q36) and obtain a searcher via self.getSearcher()."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    part = Document()
    part.add(Field("partnum", "Q36",
                   Field.Store.YES, Field.Index.NOT_ANALYZED))
    part.add(Field("description", "Illidium Space Modulator",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(part)
    writer.close()
    self.searcher = self.getSearcher()
def indexFile(self, writer, path):
    """Index *path* as latin-1 text: reader-fed contents + stored filename.

    Returns the added Document; JavaErrors from opening propagate.
    """
    try:
        stream = InputStreamReader(FileInputStream(path), 'iso-8859-1')
    except JavaError:
        raise
    else:
        doc = Document()
        # A reader-backed field is indexed but never stored.
        doc.add(Field("contents", stream))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        stream.close()
        return doc
def setUp(self):
    """Index one part (Q36) and open a read-only searcher over the index."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    part = Document()
    part.add(Field("partnum", "Q36",
                   Field.Store.YES, Field.Index.NOT_ANALYZED))
    part.add(Field("description", "Illidium Space Modulator",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(part)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
def indexFile(self, writer, path):
    """Index an HTML file: reader-based contents field + stored filename.

    Returns the added Document; any read/parse error propagates.
    """
    try:
        handle = open(path)
        text = HTMLReader(InputStreamReader(handle, 'utf-8')).read()
        handle.close()
    except:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", StringReader(text)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        return doc
def write_index(self, workflow, property=False):
    """Add all keywords from *workflow* to the index.

    When *property* is True, keywords keep their own field names instead
    of being folded into the generic "text" field.  Any previously
    indexed versions of this workflow are deleted first.
    """
    self.ddict = dict()
    # Workflow-level keys; the id doubles as the document identity.
    self.adddd("workflow_id", str(workflow.id))
    self.adddd("text", workflow.id)
    self.adddd("text", workflow.name)
    # workflow.source is deliberately not indexed.
    self.adddd("text", workflow.type)
    self.indexAnnotations(workflow.annotations, property)
    for mod in workflow.modules:
        self.adddd("module_name" if property else "text", mod.name)
        self.adddd("package" if property else "text", mod.package)
        # module.version is deliberately not indexed.
        self.adddd("module_type" if property else "text", mod.type)
        self.indexAnnotations(mod.annotations, property)
        for param in mod.parameters:
            self.adddd("parameter_name" if property else "text", param.name)
            self.adddd("parameter_value" if property else "text", param.value)
            self.adddd("parameter_type" if property else "text", param.type)
            self.indexAnnotations(param.annotations, property)
    for conn in workflow.connections:
        self.adddd("port_name" if property else "text", conn.startPort)
        self.adddd("port_name" if property else "text", conn.endPort)
        self.indexAnnotations(conn.annotations, property)
    # Flush the accumulated key/value pairs into one Lucene document.
    document = Document()
    for (key, value) in self.ddict.iteritems():
        document.add(Field(key, value, self.save, Field.Index.TOKENIZED))
    # Delete old versions, then add the new one.
    WorkflowIndexer.writer.deleteDocuments(
        [Term('workflow_id', str(workflow.id))])
    WorkflowIndexer.writer.addDocument(document)
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: Input folder for text files. output folder for index location
    Returns: void. The index is stored if generated.

    Fixes: logging.basicConfig takes `filename` (the original passed
    `file`, which was silently ignored); `logger` was used but never
    defined; the output directory is created before the log file that
    lives inside it.
    '''
    # Create the output directory first so the log file can be created in it.
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    logging.basicConfig(
        filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logger.info("Input directory for logging: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage for generation of index.
    # Merges buffer with current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)
    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store text. Only index.
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store path to assist in retrieving the file
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)  # Index
    logger.info("Indexed lines from " + input_folder +
                " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
def Indexer(docdir,indir): lucene.initVM() DIRTOINDEX = docdir INDEXDIR = indir indexdir = FSDirectory(File(INDEXDIR)) analyzer = StandardAnalyzer(VERSION.LUCENE_30) index_writer = IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')): print "Indexing ",tfile document=Document() content = open(tfile,'r').read() document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done" index_writer.optimize() print index_writer.numDocs() index_writer.close()
def indexFile(self, writer, path):
    """Read an HTML file, strip markup, and index the remaining text.

    The "contents" field is fed from a StringReader (indexed, not
    stored); the absolute filename is stored unanalyzed.  Returns the
    Document that was added.
    """
    try:
        fh = open(path)
        plain = HTMLReader(InputStreamReader(fh, 'utf-8')).read()
        fh.close()
    except:
        raise
    else:
        document = Document()
        document.add(Field("contents", StringReader(plain)))
        document.add(Field("filename", os.path.abspath(path),
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(document)
        return document
def index(self):
    """Index 100 identical docs into '<tmp>/verbose-index', with diagnostics
    routed through InfoStreamOut."""
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    store = FSDirectory.open(dirPath)
    writer = IndexWriter(store, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())
    for _ in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(self):
    """Index 100 identical docs into '<java tmpdir>/verbose-index', streaming
    the writer's diagnostics to System.out."""
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    store = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(store, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)
    for _ in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def setUp(self):
    """Index two fox sentences and open a read-only searcher over them."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for sentence in ("the quick brown fox jumped over the lazy dog",
                     "the fast fox hopped over the hound"):
        doc = Document()
        doc.add(Field("field", sentence,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def indexFile(self, writer, path):
    """Index one UTF-8 text file; returns the Document added to *writer*.

    Read errors propagate; the file handle is closed either way.
    """
    with codecs.open(path, encoding='utf-8') as handle:
        text = handle.read()
    doc = Document()
    # Reader-backed contents field: indexed but not stored.
    doc.add(Field("contents", StringReader(text)))
    doc.add(Field("filename", os.path.abspath(path),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    return doc
def luceneIndexer(docdir, indir): """ IndexDocuments from a directory """ lucene.initVM() DIRTOINDEX = docdir INDEXIDR = indir indexdir = SimpleFSDirectory(File(INDEXIDR)) analyzer = StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')): print "Indexing: ", tfile document = Document() content = open(tfile, 'r').read() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def someMethod(self):
    """Demonstrate indexing and query parsing with a legacy Lucene API.

    NOTE(review): ``Field.Text`` / ``Field.UnStored`` and the static
    ``QueryParser.parse`` belong to the old Lucene 1.x/2.x API and were
    removed in later versions -- confirm which PyLucene this targets.
    """
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)
    doc = Document()
    # Legacy convenience constructors: Text = stored + analyzed,
    # UnStored = analyzed but not stored.
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # Second add passes a per-document analyzer override.
    writer.addDocument(doc, analyzer)
    expression = "some query"
    # Deprecated static parse shown first; the instance-based call below
    # overwrites ``query``, so only the second result survives.
    query = QueryParser.parse(expression, "contents", analyzer)
    parser = QueryParser("contents", analyzer)
    # NOTE(review): the usual instance method is ``parse`` --
    # ``parseQuery`` may be version- or project-specific; verify.
    query = parser.parseQuery(expression)
def setUp(self):
    """Create a two-document in-memory index and a read-only searcher.

    Both documents share the field name "field"; whitespace analysis
    keeps the tokens exactly as written.
    """
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    first = Document()
    first.add(Field("field",
                    "the quick brown fox jumped over the lazy dog",
                    Field.Store.YES, Field.Index.ANALYZED))
    second = Document()
    second.add(Field("field",
                     "the fast fox hopped over the hound",
                     Field.Store.YES, Field.Index.ANALYZED))
    map(writer.addDocument, (first, second))
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def indexFile(self, writer, path): doc = Document() try: process = popen2.Popen4(["antiword", "-m", "UTF-8", path]) string = InputStreamReader(process.fromchild, 'utf-8').read() except: raise else: doc.add(Field("contents", StringReader(string))) doc.add(Field("filename", os.path.abspath(path), Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) exitCode = process.wait() if exitCode != 0: raise RuntimeError, "pdftotext exit code %d" %(exitCode) return doc
def testUpdate(self):
    """Delete the Amsterdam document, add a St. Petersburg one, and check
    that the hit counts flip accordingly."""
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
    # Remove the old document through a writable reader.
    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()
    # False -> append to the existing index instead of recreating it.
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    fieldSpecs = (
        ("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED),
        ("country", "Russia", Field.Store.YES, Field.Index.NO),
        ("contents", "St. Petersburg has lots of bridges",
         Field.Store.NO, Field.Index.ANALYZED),
        ("city", "St. Petersburg", Field.Store.YES, Field.Index.ANALYZED),
    )
    for name, value, store, idx in fieldSpecs:
        doc.add(Field(name, value, store, idx))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def indexDocuments(): # empty index directory indexDir = Wikipedia.directory + 'index/' for filename in os.listdir(indexDir): os.remove(indexDir + filename) # index documents lucene.initVM() version = Version.LUCENE_CURRENT analyzer = EnglishAnalyzer(version) writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True, IndexWriter.MaxFieldLength.LIMITED) for article in Wikipedia(): doc = Document() doc.add( Field('id', str(article['id'][0]), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('title', article['url'], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('content', article['text'], Field.Store.NO, Field.Index.ANALYZED)) writer.addDocument(doc) print 'Optimization' writer.optimize() writer.close()
def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):
    """Add one item/attribute/value document, streaming the indexed
    contents out of *reader*."""
    stored = Field.Store.YES
    notIndexed = Field.Index.NO
    asKeyword = Field.Index.UN_TOKENIZED
    doc = Document()
    # Exact-match identifiers.
    doc.add(Field("item", uItem.str64(), stored, asKeyword))
    doc.add(Field("attribute", uAttr.str64(), stored, asKeyword))
    # Stored-only payload fields.
    doc.add(Field("value", uValue.str64(), stored, notIndexed))
    doc.add(Field("version", str(version), stored, notIndexed))
    # Drain the incoming reader and re-wrap the text so the contents are
    # indexed with term vectors.
    contents = StringReader(reader.read())
    doc.add(Field("contents", contents, Field.TermVector.YES))
    indexWriter.addDocument(doc)
def setUp(self):
    """Split 26 animal names across two in-memory indexes (a-m and n-z)
    and open one searcher per index."""
    animals = [
        "aardvark", "beaver", "coati", "dog", "elephant", "frog",
        "gila monster", "horse", "iguana", "javelina", "kangaroo",
        "lemur", "moose", "nematode", "orca", "python", "quokka",
        "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
        "walrus", "xiphias", "yak", "zebra",
    ]
    analyzer = WhitespaceAnalyzer()
    directories = [RAMDirectory(), RAMDirectory()]
    writers = [IndexWriter(d, analyzer, True,
                           IndexWriter.MaxFieldLength.UNLIMITED)
               for d in directories]
    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # Names alphabetically before 'n' go to the first index.
        target = writers[0] if animal[0].lower() < "n" else writers[1]
        target.addDocument(doc)
    for w in writers:
        w.close()
    self.searchers = [IndexSearcher(d) for d in directories]
def luceneIndexer(docdir, indir):
    """frpFile IndexDocuments from a directory.

    Parameters:
        docdir: the path of the txt files to read and index.
        indir: the path of the index file which is generated here.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # True -> create a fresh index, overwriting any existing one.
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    # Iterate directory entries directly (was range(len(...)) over a
    # local named ``list``, shadowing the builtin).
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if not os.path.isfile(tfile):
            continue
        print ("Indexing: ", tfile)
        print ('okokokook')
        document = Document()
        # Close the file handle explicitly instead of leaking it.
        f = open(tfile, 'r')
        try:
            content = f.read()
        finally:
            f.close()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        # Bug fix: str.strip('.txt') removes any of the CHARACTERS
        # '.', 't', 'x' from both ends of the path -- use splitext to
        # drop only the extension.
        title = os.path.splitext(tfile)[0]
        document.add(Field("title", title, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):
    """Add one item/attribute/value document whose indexed contents come
    straight from the *value* string."""
    stored = Field.Store.YES
    unstored = Field.Store.NO
    analyzed = Field.Index.TOKENIZED
    notIndexed = Field.Index.NO
    asKeyword = Field.Index.UN_TOKENIZED
    doc = Document()
    # Exact-match identifiers.
    for fieldName, text in (("item", uItem.str64()),
                            ("attribute", uAttr.str64())):
        doc.add(Field(fieldName, text, stored, asKeyword))
    # Stored-only payload fields.
    doc.add(Field("value", uValue.str64(), stored, notIndexed))
    doc.add(Field("version", str(version), stored, notIndexed))
    # Contents: tokenized, unstored, with term vectors.
    doc.add(Field("contents", value, unstored, analyzed,
                  Field.TermVector.YES))
    indexWriter.addDocument(doc)
def addDocuments(self, _id, title, content):
    """Index one document; the title and content fields are only added
    when they are present and non-empty."""
    doc = Document()
    # The id is always stored as an exact-match keyword.
    doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    optional = (("titleKeyword", title), ("contentKeyword", content))
    for fieldName, text in optional:
        if text is not None and len(text) > 0:
            doc.add(Field(fieldName, text,
                          Field.Store.NO, Field.Index.ANALYZED))
    self.index_writer.addDocument(doc)