def index(): # Initialize lucene and the JVM # lucene.initVM() GLOBALDIRECTORY = getDirectory() #Indexwriter config analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, tokenCount) config = IndexWriterConfig(analyzer) writer = IndexWriter(GLOBALDIRECTORY, config) fileNames = getTxtFile(textfileDirectory) #creates document for each tweet fileNames = getTxtFile(textfileDirectory) #creates document for each tweet for file in fileNames: data = getData(file) for tweets in data: if 'text' in tweets: doc = createDocument_tweet(tweets) writer.addDocument(doc) # add the document to IndexWriter print file print "\nNumber of indexed documents: %d" % writer.numDocs( ) #number of documents indexed for testing writer.close() print "Indexing done!\n" print "------------------------------------------------------" return GLOBALDIRECTORY
class LuceneIndexer: def __init__(self, path_to_save): self.path_to_save = path_to_save self.num_docs = 0 lucene.initVM() self.indexDir = SimpleFSDirectory(File(self.path_to_save)) self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1) self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1) self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2) self.writer = IndexWriter(self.indexDir, self.writerConfig) def add_document(self, fields, header, id_): doc = Document() if len(fields) > len(header): sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_)) for field in fields: sys.stderr.write('%s\n' % field) return for idx, field in enumerate(fields): fname, fieldtype = header[idx] if fieldtype is IntField: field = int(field) doc.add(fieldtype(fname, field, Field.Store.YES)) self.writer.addDocument(doc) self.num_docs += 1 def close(self): print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs()) self.writer.close()
def index(indexdir): lucene.initVM() indexDir = SimpleFSDirectory(File(indexdir)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer()) writer = IndexWriter(indexDir, writerConfig) f = open('data/docid.documento-xml.txt') st = PorterStemmer() for i, line in enumerate(f.readlines()): id, xmltext = line.split('\t') xmltext = xmltext.rstrip('\n') xmldoc = minidom.parseString(xmltext) title = xmldoc.getElementsByTagName("TITLE") title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue authors = xmldoc.getElementsByTagName("AUTHORS") authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue abstract = xmldoc.getElementsByTagName("ABSTRACT") abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue doc = Document() doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) print "indexed %s docs" % (i+1) writer.close()
def create_index(): lucene.initVM() if os.path.exists(prm.index_folder): shutil.rmtree(prm.index_folder) indexDir = SimpleFSDirectory(File(prm.index_folder)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()) writer = IndexWriter(indexDir, writerConfig) wk = wiki.Wiki(prm.pages_path) print "%d docs in index" % writer.numDocs() print "Reading files from wikipedia..." n = 0 for l in wk.get_text_iter(): doc = Document() doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) n += 1 if n % 100000 == 0: print 'indexing article', n print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def create_index(storage, paths) : lucene.initVM() indexDir = SimpleFSDirectory(File(storage)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords : stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" import os for path in paths : for filen in os.listdir(path) : text = sent_tokenize(get_data_from_file(path + filen)) total_sent = len(text) for i in range(0, total_sent, 3) : doc = Document() a = i-5 if i-5 > 0 else 0 sentence = ' '.join(text[a:i+5]) doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print("Done %s" % (path+filen)) print "Indexed (%d docs in index)" % (writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def index(personDB, familyDB, relationDB):
    """Rebuild the match-text index from the person and family collections."""
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    #?#indexWriter.setRAMBufferSizeMB(50); KOLLA 256
    mt = matchtext()

    # One document per person: uid and sex stored, match text indexed only.
    for person in personDB.find({}, no_cursor_timeout=True):
        text = mt.matchtextPerson(person, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid', str(person['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex', str(person['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", text, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    # One document per family, marked with the sentinel sex value 'FAM'.
    for fam in familyDB.find():
        text = mt.matchtextFamily(fam, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid', str(fam['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex', 'FAM', StringField.TYPE_STORED))
        doc.add(Field("text", text, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
def wikipedia_indexer(storage, wikipedia_file): lucene.initVM() indexDir = SimpleFSDirectory(File(storage)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords: stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" f = open(wikipedia_file) for i, line in enumerate(f): text = line.strip().decode('utf-8').split('\t') title = text[0] if 'disambigu' in text[0] or len(text) < 2: continue text = text[1] doc = Document() doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO)) doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED)) writer.addDocument(doc) if writer.numDocs() % 1000 == 0: print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def testAdd(self, filepath): config = IndexWriterConfig(Version.LUCENE_CURRENT, self.getAnalyzer()) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(self.dir, config) #True,建立新索引,False,建立增量索引 file = open(filepath) contents = unicode(file.read(), 'gbk') file.close() doc = Document() doc.add(Field("name", os.path.basename(filepath), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("path", filepath, Field.Store.YES, Field.Index.NOT_ANALYZED)) if len(contents) > 0: title = self.getTxtAttribute(contents, 'Title') author = self.getTxtAttribute(contents, 'Author') language = self.getTxtAttribute(contents, 'Language') doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("Author", author, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("Language", language, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED)) else: print "warning: no content in %s" % filename writer.addDocument(doc) writer.close()
class _ChineseRamIndexer:
    """In-memory Lucene index for Chinese paragraphs of a single lesson."""

    def __init__(self):
        """Open a fresh RAM-backed index using the smart-Chinese analyzer."""
        writer_config = IndexWriterConfig(SmartChineseAnalyzer())
        # create new directory, remove previously indexed documents
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer_config.setSimilarity(mySimilarity())
        logger.debug('search similarity:{}'.format(writer_config.getSimilarity()))
        self.indexDir = RAMDirectory()
        self.writer = IndexWriter(self.indexDir, writer_config)

    def add(self, pid, content):
        """Store one paragraph under paragraph id *pid*."""
        doc = Document()
        doc.add(StringField("pid", pid, Field.Store.YES))
        doc.add(TextField("content", content, Field.Store.YES))
        self.writer.addDocument(doc)

    def close(self):
        self.writer.close()

    def index_lesson(self, parags):
        """Index every paragraph as p0, p1, ..., then close the writer."""
        for seq, paragraph in enumerate(parags):
            self.add('p' + str(seq), paragraph)
        self.close()
def publish_services(self, service_list):
    """Index a bag-of-words representation of every WSDL in *service_list*."""
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)
    for wsdl in service_list:
        # Optionally expand the raw WSDL terms through the semantic transformer.
        terms = transformer.transform(wsdl)
        if self._document_expansion:
            terms = self._semantic_transformer.transform(terms)
        bag_of_words = ' '.join(terms)
        doc = Document()
        doc.add(Field("content", bag_of_words, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1
    index_writer.close()
def create_index(storage, paths): lucene.initVM() indexDir = SimpleFSDirectory(File(storage)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords: stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" import os for path in paths: for filen in os.listdir(path): text = sent_tokenize(get_data_from_file(path + filen)) total_sent = len(text) for i in range(0, total_sent, 3): doc = Document() a = i - 5 if i - 5 > 0 else 0 sentence = ' '.join(text[a:i + 5]) doc.add( Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print("Done %s" % (path + filen)) print "Indexed (%d docs in index)" % (writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def wikipedia_indexer(storage, wikipedia_file) : lucene.initVM() indexDir = SimpleFSDirectory(File(storage)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords : stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" f = open(wikipedia_file) for i, line in enumerate(f) : text = line.strip().decode('utf-8').split('\t') title = text[0] if 'disambigu' in text[0] or len(text) < 2: continue text = text[1] doc = Document() doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO)) doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED)) writer.addDocument(doc) if writer.numDocs() % 1000 == 0 : print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def build_index(document_path, dir_path):
    """Index music tags ("content") and file paths ("url") from a listing.

    Each line of *document_path* is "<music_path> <tag1,tag2,...>".
    """
    lucene.initVM()
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(SimpleFSDirectory(Paths.get(dir_path)), config)

    # "content": stored, tokenized, indexed with docs+freqs.
    tag_type = FieldType()
    tag_type.setStored(True)
    tag_type.setTokenized(True)
    tag_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # "url": stored as-is, never tokenized.
    url_type = FieldType()
    url_type.setStored(True)
    url_type.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path = segs[0]
            music_tags = segs[1].split(",")
            document = Document()
            document.add(Field("content", " ".join(music_tags), tag_type))
            document.add(Field("url", music_path, url_type))
            index_writer.addDocument(document)
    index_writer.close()
def create_index(): lucene.initVM() if os.path.exists(prm.index_folder): shutil.rmtree(prm.index_folder) indexDir = SimpleFSDirectory(File(prm.index_folder)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()) writer = IndexWriter(indexDir, writerConfig) wk = wiki.Wiki(prm.pages_path) print "%d docs in index" % writer.numDocs() print "Reading files from wikipedia..." n = 0 for l in wk.get_text_iter(): doc = Document() doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) n += 1 if n % 100000 == 0: print 'indexing article', n print "Indexed %d docs from wikipedia (%d docs in index)" % ( n, writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
class TAindexer(object):
    """Index question posts (Title / PostTypeId / AnswerCount) from a
    StackExchange Posts XML dump."""

    # Lucene field names ("FILED" is an existing typo kept for compatibility).
    INDEX_FILED_TITLE = "Title"
    INDEX_FILED_POSTID = "PostTypeId"
    INDEX_FILED_AC = "AnswerCount"

    def __init__(self, dir, data_file):
        # dir: directory for the index; data_file: path to the Posts XML dump.
        self.dir = dir
        self.data_file = data_file
        index_dir = FSDirectory.open(Paths.get(self.dir))
        analyzer = StandardAnalyzer()
        writer_config = IndexWriterConfig(analyzer)
        # CREATE: always start a brand-new index, discarding any existing one.
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(index_dir, writer_config)

    def get_document(self, title, post_id, ac):
        """Build one Lucene document with the three stored text fields."""
        doc = Document()
        doc.add(TextField(self.INDEX_FILED_TITLE, title, Field.Store.YES))
        doc.add(TextField(self.INDEX_FILED_POSTID, post_id, Field.Store.YES))
        doc.add(TextField(self.INDEX_FILED_AC, ac, Field.Store.YES))
        return doc

    def index_TAs(self):
        """Stream-parse the dump; index rows whose PostTypeId == '1' (questions)."""
        start_time = datetime.now()
        print "Start to index " + self.data_file + " @ " + str(start_time)
        count = 0
        for event, elem in ET.iterparse(self.data_file):
            print datetime.now()
            if event == 'end':
                # Attributes may be absent; fall back to empty strings.
                if elem.get('Title'):
                    title = elem.get('Title')
                else:
                    title = ''
                if elem.get('PostTypeId'):
                    post_id = elem.get('PostTypeId')
                else:
                    post_id = ''
                if elem.get('AnswerCount'):
                    answer_count = elem.get('AnswerCount')
                else:
                    answer_count = ''
                print post_id
                print answer_count
                if str(post_id) == '1':
                    print self.dir
                    doc = self.get_document(title.encode("utf-8"), post_id.encode("utf-8"), answer_count.encode("utf-8"))
                    self.writer.addDocument(doc)
                    print count
                    count = count + 1
                # Free the parsed element to keep memory bounded on big dumps.
                elem.clear()
        self.writer.close()
        print "Indexing " + self.data_file + " complete @ " + str(datetime.now())
        print "Indexing time is " + str(datetime.now() - start_time)
def index(self, data):
    """Append one record to the existing index and flush it."""
    writer = IndexWriter(self.d, self.conf)
    document = self.buildDocument(data['fields'], data['record'])
    writer.addDocument(document)
    writer.commit()
    writer.close()
class Indexer(object):
    """Index a directory of saved HTML pages (first line = URL, rest = HTML)."""

    def __init__(self, docDir, indexDir, analyzer):
        #set index dir
        if not os.path.exists(indexDir):
            os.makedirs(indexDir)
        self.indexDir = SimpleFSDirectory(Paths.get(indexDir))
        self.docDir = docDir
        # NOTE(review): the analyzer argument is ignored; a token-capped
        # StandardAnalyzer is always used instead — confirm this is intended.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        writerConfig = IndexWriterConfig(self.analyzer)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.indexing()

    def indexing(self):
        """Walk docDir, strip each HTML file down to plain text, and index it."""
        # t1: stored, untokenized, not indexed — used for the link and title.
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.NONE)
        # t2: stored, tokenized, indexed with docs+freqs — used for body text.
        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        for filename in os.listdir(self.docDir):
            if filename.endswith('.html') or filename.endswith('.htm'):
                with open(os.path.join(self.docDir, filename)) as f:
                    url = f.readline().strip()
                    htmlString = f.read()
                    #remove HTML markup
                    soup = BeautifulSoup(htmlString, 'html.parser')
                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract() # rip it out
                    # get text
                    text = soup.get_text()
                    # break into lines and remove leading and trailing space on each
                    lines = (line.strip() for line in text.splitlines())
                    # break multi-headlines into a line each
                    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
                    # drop blank lines
                    text = '\n'.join(chunk for chunk in chunks if chunk)
                    #text = soup.get_text().strip()
                    title = soup.title.string
                    #print text
                    doc = Document()
                    doc.add(Field("link", url, t1))
                    doc.add(Field("title", title, t1))
                    doc.add(Field("text", text, t2))
                    self.writer.addDocument(doc)
                    print "index document", filename
        self.writer.commit()
        self.writer.close()
def rebuildIndex(self, data):
    """Recreate the index from scratch with every record in *data*."""
    # OpenMode.CREATE drops whatever was previously indexed.
    writer = IndexWriter(self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))
    for rec in data['records']:
        writer.addDocument(self.buildDocument(data['fields'], rec))
    writer.commit()
    writer.close()
def index_files(): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() BASE_DIR = path.dirname(path.abspath(sys.argv[0])) INPUT_DIR = BASE_DIR + "/input/" INDEX_DIR = BASE_DIR + "/lucene_index/" NoT = 100000 # Number of Tokens print "------------------------------------------------------" print "PyLucene Demo started (lucene_demo.py)" print "Python version: %d.%d.%d" % ( sys.version_info.major, sys.version_info.minor, sys.version_info.micro) print 'Lucene version:', lucene.VERSION print "------------------------------------------------------\n" # lucene.initVM() # directory = RAMDirectory() index_path = Paths.get(INDEX_DIR) directory = SimpleFSDirectory(index_path) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, NoT) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) print "Number of indexed documents: %d\n" % writer.numDocs() for input_file in listdir(INPUT_DIR): # iterate over all input files print "Current file:", input_file if input_file.endswith(".json"): with open(INPUT_DIR + input_file) as f: for line in f: # doc = create_document(line, input_file) # call the create_document function o = json.loads(line) doc = Document() # create a new document doc.add(TextField("filename", input_file, Field.Store.YES)) # print file doc.add( TextField("username", o['user']['screen_name'], Field.Store.YES)) # print "username: "******"text", o['text'], Field.Store.YES)) # print "text: " + o['text'] if o['user']['location']: doc.add( TextField("location", o['user']['location'], Field.Store.YES)) # print "location: " + o['user']['location'] doc.add(TextField("time", o['created_at'], Field.Store.YES)) writer.addDocument( doc) # add the document to the IndexWriter print "\nNumber of indexed documents: %d" % writer.numDocs() writer.close() print "Finished\n" print "-----------------------------------------------------"
class Indexer(object):

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index (default "index")
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)
        # Wrap the chosen analyzer so a single field cannot exceed ~1M tokens.
        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        # CREATE: each Indexer instance starts a brand-new index.
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer. """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        # "clean" fields: stored, not tokenized (exact-value lookups).
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        # "dirty" fields: tokenized full text with positions, not stored.
        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def indexer(docNumber, docText): lucene.initVM() indexDir = SimpleFSDirectory(File("index/")) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer()) writer = IndexWriter(indexDir, writerConfig) doc = Document() doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print "Closing index of %d docs..." % writer.numDocs() writer.close()
class WikiPageIndex():
    """Create, search, and clean a Lucene index of wiki pages (Title + Text)."""

    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])
        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        # CREATE: when createIndex runs, a fresh index replaces any existing one.
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        """Open the IndexWriter (and create the index directory if missing)."""
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        """Add one page with stored, analyzed Title and Text fields."""
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        # Flush pending documents and release the writer.
        self.writer.commit()
        self.writer.close()

    def searchIndex(self, queryString, field="Text", max_results=100):
        """Run *queryString* against *field*; return up to *max_results* docs."""
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)
        return docs

    @staticmethod
    def cleanWikiText(text):
        """Strip wiki markup ([[..]], {{..}}, tables) and collapse whitespace."""
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
def retrival_answer(MAX): lucene.initVM() directory = RAMDirectory() indexDir = SimpleFSDirectory(Paths.get('index')) writerConfig = IndexWriterConfig(StandardAnalyzer()) writer = IndexWriter(directory, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading lines from Document..." process_doc = open("Huawei_result/document.txt", "r") doc_line = process_doc.readlines() for l in doc_line: doc = Document() doc.add(TextField("text", l, Field.Store.YES)) writer.addDocument(doc) print "Indexed from %d docs in index" % (writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close() accuracy = [] process_query = open("Huawei_result/query.txt", "r") query_line = process_query.readlines() for n, one_query in enumerate(query_line): analyzer = StandardAnalyzer() # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index'))) searcher = IndexSearcher(DirectoryReader.open(directory)) # searcher = IndexSearcher(reader) query = QueryParser("text", analyzer).parse(one_query) hits = searcher.search(query, MAX) # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query) # print "The groundtruth document is:", doc_line[n] candidate_doc = [] for hit in hits.scoreDocs: # print hit.score, hit.doc, hit.toString() doc = searcher.doc(hit.doc) # print doc.get("text").encode("utf-8") candidate_doc.append(doc.get("text")) choices = process.extract(unicode(doc_line[n]), candidate_doc) flag = 0 for i in range(len(choices)): if choices[i][1] >= 89: flag = 1 if flag == 1: accuracy.append(1) else: accuracy.append(0) final_accuracy = float(sum(accuracy)) / float(len(accuracy)) print "the final accuracy is:", final_accuracy
class PyLuceneRetrievalSystem(object):
    """In-memory TREC-style retrieval: parse <docno>/<text> articles,
    index them, and rank documents with BM25."""

    def __init__(self):
        self.num_doc = 0
        self.directory = RAMDirectory()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.writer = IndexWriter(self.directory, self.config)

    def parse_articles(self, file_name):
        """Read the SGML-like corpus at *file_name* and index every article.

        Closes the writer when done, so this should only be called once.
        """
        raw_articles = open(file_name).read()
        parsed_articles = BeautifulSoup(raw_articles, 'lxml')
        doc_nos = parsed_articles.find_all('docno')
        doc_texts = parsed_articles.find_all('text')
        self.num_doc = len(doc_nos)
        bar = ProgressBar()
        for i in bar(range(self.num_doc)):
            doc = Document()
            doc.add(Field('docno', doc_nos[i].string.strip(), StringField.TYPE_STORED))
            # Body text is lower-cased and stripped of punctuation before indexing.
            doc.add(Field('content', re.sub('[^a-zA-Z0-9\n]', ' ', doc_texts[i].get_text()).lower(), TextField.TYPE_STORED))
            self.writer.addDocument(doc)
        self.writer.close()

    def get_sorted_results(self, query):
        """Search both docno and content fields; print the 10 best BM25 hits."""
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)
        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)
        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)
            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
            j += 1
def indexing(self):
    """Index the pre-segmented, stopword-free documents into self.index_dir.

    Asks for confirmation when the index directory already exists; returns -1
    if the user declines.
    """
    docs = self.text.load_seg_without_stopword_data()
    if (os.path.exists(self.index_dir)):
        r = input("Indexing Dir has existed! Continue indexing?")
        if (r.lower() != 'y'):
            return -1
    if (not os.path.exists(self.index_dir)):
        os.makedirs(self.index_dir)
    store = SimpleFSDirectory(Paths.get(self.index_dir))
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    def _field_type(stored, tokenized):
        # All fields are indexed with docs + freqs + positions.
        conf = FieldType()
        conf.setStored(stored)
        conf.setTokenized(tokenized)
        conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return conf

    id_conf = _field_type(True, False)      # doc id: stored keyword
    date_conf = _field_type(True, True)     # PubDate: stored, tokenized
    content_conf = _field_type(True, True)  # everything else: full text

    for n, record in enumerate(docs):
        document = Document()
        for key, content in record.items():
            field_conf = date_conf if key == 'PubDate' else content_conf
            document.add(Field(key, content, field_conf))
        document.add(Field('id', str(n), id_conf))
        writer.addDocument(document)
        if (n % 1000 == 0):
            print(n)
    writer.commit()
    writer.close()
class IndexBuilder(object):
    """Builds (or updates) a Lucene index over OHSUMED-style parsed documents.

    Each document is a dict keyed by field markers: ".I" (id), ".U" (uid),
    ".T" (title), and optionally ".W" (abstract) and ".M" (MeSH terms).
    """

    def __init__(self, index_path, update=False):
        """Open an IndexWriter at *index_path*; append when *update* is True."""
        dir = FSDirectory.open(Paths.get(index_path))
        analyzer = StandardAnalyzer()
        iwc = IndexWriterConfig(analyzer)
        if update:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        else:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(dir, iwc)

    def index_docs(self, input_documents):
        """Index every document; in append mode, update by the ".U" key instead."""
        for document in tqdm(input_documents, total=len(input_documents)):
            doc = Document()
            doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
            doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))

            # Full-text field: stored, tokenized, with positions/offsets and
            # term vectors so later highlighting/similarity work.
            type = FieldType()
            type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
            type.setStored(True)
            type.setStoreTermVectors(True)
            type.setTokenized(True)

            # NOTE(fix): collapsed the four duplicated .M/.W branches into one
            # composition that preserves the original concatenation exactly:
            # [".M" + " "] + ".T" + [".W"]  (no space between .T and .W).
            combined = document[".T"].lower()
            if ".W" in document:
                combined += document[".W"].lower()
            if ".M" in document:
                combined = document[".M"].lower() + " " + combined
            doc.add(Field("text", " ".join(tokenizer.tokenize(combined)), type))

            if self.writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE:
                self.writer.addDocument(doc)
            else:
                # Incremental mode: replace any existing doc with the same ".U".
                self.writer.updateDocument(Term(".U", document[".U"]), doc)
        self.writer.close()
def index(self): """ This function is used to index the preprocessed data. The inverted index will be saved to ./index/ folder business_id, name, address, categories, review and tip data are indexed. """ print "INDEXING..." lucene.initVM() indexDir = SimpleFSDirectory(File("index/")) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()) writer = IndexWriter(indexDir, writerConfig) # each business indexed as a document for key, business in self.data.items(): doc = Document() text = "" doc.add( Field("id", business["business_id"], Field.Store.YES, Field.Index.ANALYZED)) doc.add( Field("name", business["name"], Field.Store.YES, Field.Index.ANALYZED)) doc.add( Field("address", business["full_address"], Field.Store.YES, Field.Index.ANALYZED)) cat_text = "\n".join(business["categories"]) doc.add( Field("category", cat_text, Field.Store.YES, Field.Index.ANALYZED)) # combine all reviews of a business together review_text = "" for review in business["review"]: review_text += review["text"] # combine all tip of a business together tip_text = "" for tip in business["tip"]: tip_text += tip["text"] # concatenate the data to be indexed and add it as one field text += business["name"] text += business["full_address"] text += cat_text text += review_text text += tip_text doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED)) # add the business doc to writer writer.addDocument(doc) writer.close()
def dummyIndex(self):
    """Seed the index with one placeholder document so later updates in
    APPEND mode do not fail on a missing index."""
    placeholder = Document()
    placeholder.add(Field('uid', 'dummy', StringField.TYPE_STORED))
    cfg = IndexWriterConfig(self.analyzer)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    idx_writer = IndexWriter(self.indexDir, cfg)
    idx_writer.addDocument(placeholder)
    idx_writer.commit()
    idx_writer.close()
    return
def create_miniindex(docs):
    """Index the given pre-built documents into a fresh in-memory directory
    and return that directory."""
    store = RAMDirectory()
    cfg = IndexWriterConfig(StandardAnalyzer())
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    mini_writer = IndexWriter(store, cfg)
    for entry in docs:
        mini_writer.addDocument(entry)
    mini_writer.commit()
    mini_writer.close()
    return store
class DocRepo:
    """In-memory Lucene repository over the global answers_train corpus,
    queried with BM25 to find the most similar stored answer."""

    def __init__(self):
        # self.analyzer = StandardAnalyzer()
        # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
        # self.analyzer = PersianAnalyzer()
        self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
        self.config = IndexWriterConfig(self.analyzer)
        self.index = RAMDirectory()
        self.w = IndexWriter(self.index, self.config)

    def addDocument(self, id):
        """Index answers_train[id]: analyzed text in "pa", stored id in "id".
        Commits per document (slow for bulk loads, but keeps reads fresh)."""
        global answers_train
        preA = answers_train[id]
        doc = Document()
        doc.add(TextField("pa", preA, Field.Store.YES))
        doc.add(StringField("id", str(id), Field.Store.YES))
        self.w.addDocument(doc)
        self.w.commit()

    def __del__(self):
        # NOTE(review): closing the writer in __del__ is unreliable —
        # finalization order at interpreter shutdown is not guaranteed.
        self.w.close()

    def get_most_similar(self, sentence, do_log=False):
        """Return (top ScoreDoc, id as int) for the best BM25 match of
        `sentence`, or (None, -1) when nothing matches.

        Builds an OR-of-terms query by hand (stop words in `sw` skipped)
        instead of using QueryParser.
        """
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        # Near-real-time reader opened directly from the writer, so
        # uncommitted adds are visible too.
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)
        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs
        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
def testAdd(self, goodname, salenum, price, shopname, url, picturename,
            comment, historyprice):
    """Add one product record to the index at self.dir.

    Chinese text fields (good name, shop name) are segmented with jieba and
    indexed in separate "_s" fields; the raw values are stored unindexed.
    NOTE(review): the writer is never committed or closed here — presumably
    handled elsewhere, or the add may be lost; confirm.
    """
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)  # True: new index, False: incremental index
    # Stored-only field type: kept in the index but not searchable.
    noIndexedString = FieldType()
    noIndexedString.setTokenized(False)
    noIndexedString.setIndexed(False)
    noIndexedString.setStored(True)
    try:
        print "adding", goodname
        # Segment the good name with jieba (accurate mode) for full-text search.
        goodname_s = unicode(goodname, 'utf8')
        seg_list_good = jieba.cut(goodname_s, cut_all=False)
        goodname_s = " ".join(seg_list_good)  # accurate (default) mode
        shopname_s = unicode(shopname, 'utf8')
        seg_list_shop = jieba.cut(shopname_s, cut_all=False)
        shopname_s = " ".join(seg_list_shop)  # accurate (default) mode
        shopnameField = Field("shopName", shopname, noIndexedString)
        shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
        goodnameField = Field("goodName", goodname, noIndexedString)
        goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
        salenumField = IntField("saleNum", salenum, Field.Store.YES)
        priceField = DoubleField("price", price, Field.Store.YES)
        urlField = Field("url", url, noIndexedString)
        pictureField = StringField("pictureName", picturename, Field.Store.YES)
        commentField = Field("comments", comment, noIndexedString)
        historyPriceField = Field("historyPrice", historyprice, noIndexedString)
        doc = Document()
        doc.add(shopnameField)
        doc.add(shopnameField_s)
        doc.add(goodnameField)
        doc.add(goodnameField_s)
        doc.add(salenumField)
        doc.add(priceField)
        doc.add(urlField)
        doc.add(pictureField)
        doc.add(commentField)
        doc.add(historyPriceField)
        writer.addDocument(doc)
    except Exception, e:
        print "Failed in indexDocs:", e
def buildIndex(self, inputFile): analyzer = self.getAnalyzer() iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter( SimpleFSDirectory( File(self.luceneDir) ), iwconf) # read through input file and write out to lucene counter = 0 linesReadCounter = 0 with open(inputFile, 'r') as lines: linesRead = 0 for line in lines: try: linesRead+=1 if linesRead % 1000 == 0: print "%d lines read" % linesRead cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t") concept = concept.strip() cui = cui.strip() strNorm = self.normalizeCasePunct(concept) strSorted = self.sortWords(strNorm) strStemmed = self.stemWords(strNorm) strStemmedSorted = self.stemWords(strSorted) fdoc = Document() counter +=1 fid = counter fdoc.add( Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(fdoc) if fid % 1000 == 0: writer.commit() except: "Skipping line: %s" % line writer.commit() writer.close()
class QAindexer(object):
    """Index a StackExchange-style Posts XML dump: one Lucene document per
    post, with Title and tag-stripped Body as analyzed, stored text fields."""

    # Lucene field names (note: "FILED" typo is part of the public constants).
    INDEX_FILED_TITLE = "Title"
    INDEX_FILED_BODY = "Body"

    def __init__(self, dir, data_file):
        # dir: index output directory; data_file: path to the XML dump.
        self.dir = dir
        self.data_file = data_file
        index_dir = FSDirectory.open(Paths.get(self.dir))
        analyzer = StandardAnalyzer()
        writer_config = IndexWriterConfig(analyzer)
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(index_dir, writer_config)

    def get_document(self, title, body):
        """Build a Lucene Document with stored, analyzed Title/Body fields."""
        doc = Document()
        doc.add(TextField(self.INDEX_FILED_TITLE, title, Field.Store.YES))
        doc.add(TextField(self.INDEX_FILED_BODY, body, Field.Store.YES))
        return doc

    def index_QAs(self):
        """Stream-parse the XML dump and index every element; closes the
        writer when finished (one-shot use)."""
        print "Start to index " + self.data_file + " @ " + str(datetime.now())
        count = 0
        # iterparse keeps memory flat; elem.clear() below releases each node.
        for event, elem in ET.iterparse(self.data_file):
            if event == 'end':
                if elem.get('Title'):
                    title = elem.get('Title')
                    print title
                else:
                    title = ''
                if elem.get('Body'):
                    body = elem.get('Body')
                    # Strip HTML tags from the body before indexing.
                    body = re.sub(r'</?\w+[^>]*>', '', body)
                    print body
                else:
                    body = ''
                doc = self.get_document(title.encode("utf-8"),
                                        body.encode("utf-8"))
                self.writer.addDocument(doc)
                print count
                count = count + 1
                elem.clear()
        self.writer.close()
        print "Indexing " + self.data_file + " complete @ " + str(
            datetime.now())
def index(analyzer, index_dest_dir, documents):
    """
    Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all(isinstance(d, Document) for d in documents):
        raise TypeError("documents should be iterable of type Document! Given: %s" % type(documents[0]))

    dest = SimpleFSDirectory(File(index_dest_dir))
    idx_writer = IndexWriter(dest, IndexWriterConfig(Version.LUCENE_30, analyzer))
    for entry in documents:
        idx_writer.addDocument(entry)
    idx_writer.close()
def IndexDocs(self, documents): """ Index documents under the directory :Parameters: - `documents`: Documents to be indexed (List) """ # Get the Writer Configuration writerConfig = IndexWriterConfig(self.__analyzer) # Get index writer writer = IndexWriter(self.__indexDir, writerConfig) for document in documents: # Create a document that would we added to the index doc = Document() # Add a field to this document doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES)) doc.add( Field(Indexer.CONTENT, document['content'], self.__contentType)) doc.add( StringField(Indexer.DATE, document['date'], Field.Store.YES)) doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES)) doc.add( TextField(Indexer.TAGS, self.__qualifyTags(document['tags']), Field.Store.YES)) doc.add( LongPoint(Indexer.TIMESTAMP, self.__getTimestamp(document['date']))) # Add or update the document to the index if not self.__boAppend: # New index, so we just add the document (no old document can be there): if self.__verbose: print("Adding " + document['name']) writer.addDocument(doc) else: # Existing index (an old copy of this document may have been indexed) so # we use updateDocument instead to replace the old one matching the exact # path, if present: if self.__verbose: print("Updating " + document['name']) writer.updateDocument(Term(Indexer.NAME, document['name']), doc) # Print index information and close writer print("Indexed %d documents (%d docs in index)" % (len(documents), writer.numDocs())) writer.close()
def xmlrpc_indexDocument(self, instance, id, text):
    """Index a new document, replacing any previous version with the same id.

    Uses the legacy Lucene API (pre-2.x style: IndexWriter(path, analyzer,
    create), UN_TOKENIZED/TOKENIZED field flags, optimize()).
    Returns 1 so the XML-RPC call has a marshallable success value.
    """
    # Remove any stale copy first so ids stay unique in the index.
    self.xmlrpc_unindexDocument(instance, id)

    # Create a document and add two keyword fields plus the analyzed text.
    doc = Document()
    doc.add(Field('id', id, Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field('text', text, Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(Field('instance', instance, Field.Store.YES,
                  Field.Index.UN_TOKENIZED))

    # Write the document into the index (0 = append to existing index).
    writer = IndexWriter(self.indexPath, self.analyzer, 0)
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    log('Insert: Instance: %s Document: %s' %(instance, id))
    return 1
def index_wiki(wiki_xmlfile, index_directory_name):
    """Index a wiki XML dump: one document per page with contents/title/url.

    Assumes the JVM is already started (lucene.initVM() called elsewhere)
    and that `wikicorpusxml` yields one page's XML string at a time.
    """
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    # Set document content field type (analyzed, stored, with positions).
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type (kept as a single token).
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        # Crude string parsing: text between the first '>' and '<', and the
        # title="..." / url="..." attribute values.
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
def indexing():
    """Index every *.txt file under TEXT_DIR into INDEX_DIR, storing the
    title, post body and extracted terms of each file."""
    print("建立索引,文本文件夹 [%s] ..." % TEXT_DIR)
    create_dir(INDEX_DIR)
    store = SimpleFSDirectory(Paths.get(INDEX_DIR))
    idx_writer = IndexWriter(store, IndexWriterConfig(ANALYZER))
    for txt_path in glob.glob(os.path.join(TEXT_DIR, "*.txt")):
        title, post, terms = get_terms(txt_path)
        if not terms:
            # Nothing extracted from this file — skip it.
            continue
        entry = Document()
        entry.add(Field("title", title, TextField.TYPE_STORED))
        entry.add(Field("post", post, TextField.TYPE_STORED))
        entry.add(Field("terms", terms, TextField.TYPE_STORED))
        idx_writer.addDocument(entry)
    idx_writer.commit()
    idx_writer.close()
def index(analyzer, index_dest_dir, documents):
    """
    Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    for d in documents:
        if not isinstance(d, Document):
            raise TypeError(
                "documents should be iterable of type Document! Given: %s" %
                type(documents[0]))

    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)),
                         IndexWriterConfig(Version.LUCENE_30, analyzer))
    for d in documents:
        writer.addDocument(d)
    writer.close()
def index_wiki(wiki_xmlfile, index_directory_name): lucene.initVM() # Initialize index directory and analyzer. version = Version.LUCENE_CURRENT store = FSDirectory.open(File(index_directory_name)) analyzer = StandardAnalyzer(version) # Creates config file. config = IndexWriterConfig(version, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) # Set document content field type. content_fieldtype = FieldType() content_fieldtype.setIndexed(True) content_fieldtype.setStored(True) content_fieldtype.setTokenized(True) content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) # Set document title field type. title_fieldtype = FieldType() title_fieldtype.setIndexed(True) title_fieldtype.setStored(True) title_fieldtype.setTokenized(True) title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) # Set document url field type. url_fieldtype = FieldType() url_fieldtype.setIndexed(True) url_fieldtype.setStored(True) url_fieldtype.setTokenized(False) url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for xmldoc in wikicorpusxml((wiki_xmlfile)): content = xmldoc.partition('>')[2].partition('<')[0].strip() title = xmldoc.partition(' title="')[2].partition('"')[0].strip() url = xmldoc.partition(' url="')[2].partition('"')[0].strip() doc = Document() doc.add(Field("contents", content, content_fieldtype)) doc.add(Field("title", title, title_fieldtype)) doc.add(Field("url", url, url_fieldtype)) writer.addDocument(doc) writer.commit() writer.close()
class PyLuceneRetrievalSystem(object):
    """In-memory TREC-style retrieval system: parse <DOCNO>/<TEXT> articles
    with BeautifulSoup, index them into a RAMDirectory, and search with BM25."""

    def __init__(self):
        self.num_doc = 0
        self.directory = RAMDirectory()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.writer = IndexWriter(self.directory, self.config)

    def parse_articles(self, file_name):
        """Parse `file_name` and index one document per <docno>/<text> pair.
        Closes the writer, so this can only be called once per instance.

        NOTE(review): assumes the i-th <docno> pairs with the i-th <text>
        element — confirm the corpus guarantees equal counts and order.
        """
        raw_articles = open(file_name).read()
        parsed_articles = BeautifulSoup(raw_articles, 'lxml')
        doc_nos = parsed_articles.find_all('docno')
        doc_texts = parsed_articles.find_all('text')
        self.num_doc = len(doc_nos)
        bar = ProgressBar()
        for i in bar(range(self.num_doc)):
            doc = Document()
            doc.add(Field('docno', doc_nos[i].string.strip(),
                          StringField.TYPE_STORED))
            # Keep only alphanumerics/newlines, lowercased, for indexing.
            doc.add(Field('content',
                          re.sub('[^a-zA-Z0-9\n]', ' ',
                                 doc_texts[i].get_text()).lower(),
                          TextField.TYPE_STORED))
            self.writer.addDocument(doc)
        self.writer.close()

    def get_sorted_results(self, query):
        """Print the top-10 BM25 results (docno and score) for `query`,
        matching over both the docno and content fields."""
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                                   query,
                                                   ['docno', 'content'],
                                                   [SHOULD, SHOULD],
                                                   self.analyzer)
        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)
        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)
            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
            j += 1
def index(self):
    """
    Index the preprocessed Yelp data; the inverted index is written to ./index/.
    business_id, name, address, categories, review and tip data are indexed.

    NOTE(review): duplicate of the other `index` method in this file — the two
    copies should probably be consolidated.
    """
    print "INDEXING..."
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    # each business indexed as a document
    for key, business in self.data.items():
        doc = Document()
        text = ""
        doc.add(Field("id", business["business_id"], Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("name", business["name"], Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("address", business["full_address"], Field.Store.YES,
                      Field.Index.ANALYZED))
        cat_text = "\n".join(business["categories"])
        doc.add(Field("category", cat_text, Field.Store.YES,
                      Field.Index.ANALYZED))

        # combine all reviews of a business together
        review_text = ""
        for review in business["review"]:
            review_text += review["text"]

        # combine all tip of a business together
        tip_text = ""
        for tip in business["tip"]:
            tip_text += tip["text"]

        # concatenate the data to be indexed and add it as one catch-all field
        text += business["name"]
        text += business["full_address"]
        text += cat_text
        text += review_text
        text += tip_text
        doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))

        # add the business doc to writer
        writer.addDocument(doc)
    writer.close()
def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
    """Merge-update the index after two person records were combined:
    delete both old index entries, re-index pid1 (the surviving record),
    and refresh self.searcher so searches see the change.

    :param pid1: id of the surviving person record (re-indexed)
    :param pid2: id of the removed person record (only deleted)
    """
    config = IndexWriterConfig(self.analyzer)
    # APPEND: the index must already exist.
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, config)
    mt = matchtext()
    writer.deleteDocuments(Term('uid', pid1))
    writer.deleteDocuments(Term('uid', pid2))
    p = personDB.find_one({'_id': pid1})
    matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
    doc = Document()
    doc.add(Field('uid',str(pid1), StringField.TYPE_STORED))
    doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
    # "match" and "text" are searchable but not stored.
    doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
    doc.add(Field("text", mt.luceneFix(self.personText(p)),
                  TextField.TYPE_NOT_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    # Re-open the searcher so the committed changes are visible.
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    return
def update(self):
    """Rewrite the index entries for self._data: purge stale records, then
    index each item with a stored 'id' plus its non-empty fields ('text' is
    analyzed, everything else is a keyword field)."""
    delete(self._dir, self._counter)
    writer = IndexWriter(
        SimpleFSDirectory(Paths.get(self._dir)),
        IndexWriterConfig(StandardAnalyzer()))
    for doc_id, fields in self._data.items():
        record = Document()
        record.add(Field('id', doc_id, StringField.TYPE_STORED))
        for fname, fval in fields.items():
            if not fval:
                continue  # skip empty values
            if fname == 'text':
                record.add(Field('text', fval, TextField.TYPE_STORED))
            else:
                record.add(Field(fname, fval, StringField.TYPE_STORED))
        writer.addDocument(record)
    writer.commit()
    writer.close()
class Indexer:
    """Index a directory tree of Latin-1 text files: one document per file
    with name, parent directory and (term-vectored) content fields."""

    def __init__(self, writerConfig, indexDir):
        # Starts the JVM here — so only one Indexer can initVM per process.
        lucene.initVM()
        self.mIndexDir = SimpleFSDirectory(File(indexDir))
        self.mConfig = writerConfig
        self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)

    def index(self, root):
        """Walk `root` recursively and index every file; closes the writer
        when done (one-shot use)."""
        # Analyzed, stored content field with term vectors.
        t = FieldType()
        t.setIndexed(True)
        t.setStored(True)
        t.setTokenized(True)
        t.setStoreTermVectors(True)

        for path, dirs, files in os.walk(root):
            for file in files:
                filePath = os.path.join(path, file)
                fd = open(filePath)
                content = unicode(fd.read(), 'iso-8859-1')
                fd.close()
                doc = Document()
                doc.add(Field('name', file, StringField.TYPE_STORED))
                # parent = name of the immediate containing directory
                parent = os.path.split(path)[1]
                doc.add(Field('parent', parent, StringField.TYPE_STORED))
                if len(content) > 0:
                    doc.add(Field('content', content, t))
                print 'Indexing %s' % file
                self.mWriter.addDocument(doc)
        self.mWriter.commit()
        self.mWriter.close()
def lucene_indexing(): lucene.initVM() index_dir = os.getcwd() dir = SimpleFSDirectory(File(index_dir)) analyzer = StandardAnalyzer(Version.LUCENE_48) index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer); index_writer = IndexWriter(dir, index_writer_config) for tfile in glob.glob(os.path.join(index_dir, '*.txt')): print "Indexing: ", tfile document = Document() with open(tfile, 'r') as f: content = f.read() document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED)) document.add(Field("title", tfile, Field.Store.YES, Field.Index.ANALYZED)) index_writer.addDocument(document) print index_writer.numDocs() index_writer.close()
def index(self):
    """Build the sentence-retrieval index from the merged document dump:
    one Lucene document per sentence, restricted to pages listed by
    self.doc().  Page id and sentence number are stored (not searchable);
    the sentence text is searchable (not stored)."""
    # if exists sent_index, delete and create a new one
    doc_tool.cleardir(index_root)
    doc_tool.mkdir(index_root)
    index_dir = FSDirectory.open(Paths.get(index_root))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_dir, writer_config)

    # ft1: stored-only (retrievable identifiers, not indexed)
    ft1 = FieldType()
    ft1.setStored(True)
    ft1.setIndexOptions(IndexOptions.NONE)

    # ft2: indexed with positions for phrase queries, but not stored
    ft2 = FieldType()
    ft2.setStored(False)
    ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    doc_list = self.doc()
    file_path = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")
    file_list = os.listdir(file_path)
    num = 0
    for file in file_list:
        docs = doc_tool.load_json_file(file_path, file)
        for page_identifier in docs:
            # Only index pages that made the earlier document-retrieval cut.
            if page_identifier in doc_list:
                num += 1
                for sent_number in docs[page_identifier]:
                    sentence_text = self.process_sent(
                        docs[page_identifier][sent_number])
                    doc = Document()
                    doc.add(Field("page_identifier", page_identifier, ft1))
                    doc.add(Field("sentence_number", sent_number, ft1))
                    doc.add(Field("sentence_text", sentence_text, ft2))
                    writer.addDocument(doc)
                print(num)
    writer.commit()
    writer.close()
    index_dir.close()
def __index(self, emailInfo):
    """Fetch the user's IMAP Inbox and index message subjects/recipients into
    a temporary on-disk Lucene index; leaves self.searcher open on it.

    Jython-style code mixing Python and Java APIs (Lucene 3.3, JavaMail).
    NOTE(review): several apparent defects left as-is — `h` (used for the
    protocol lookup) is undefined in this scope; `toSrc` is rebound to a
    Python list and then called with Java String methods (.substring /
    .indexOf), which will fail; `urlPrefix` is computed but never used;
    `Field`/`IndexSearcher` are not imported locally.  Confirm against the
    original project before touching behavior.
    """
    from org.apache.lucene.index import IndexWriterConfig
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    analyser = StandardAnalyzer(Version.LUCENE_33)
    conf = IndexWriterConfig(Version.LUCENE_33, analyser)
    from org.apache.lucene.store import FSDirectory
    from java.io import File
    # Temp-file trick: create a unique name, then reuse it as a directory.
    storage = File.createTempFile(u'Tubelight-', '.index')
    storage.delete()
    storage.mkdir()
    storage.deleteOnExit()
    self.storage = storage.getAbsolutePath()
    from java.io import File
    self.session.setAttribute('directory', storage.getAbsolutePath()+File.separator+'mail.idx')
    directory = FSDirectory.open(storage)
    from org.apache.lucene.index import IndexWriter
    iw = IndexWriter(directory, conf)
    from us.d8u.tubelight import Configuration
    addr = emailInfo[Configuration.EmailAddressKey]
    (username, server) = addr.split('@')
    from java.lang import System
    System.setProperty("mail.imap.partialfetch", "false")
    urlPrefix = (("imap://%s@%s:%d/Inbox") % (username, server, int(emailInfo[Configuration.EmailPortKey])))
    from javax.mail import Session
    session = Session.getDefaultInstance(System.getProperties(), None).getStore(h.get(Configuration.EmailProtocolKey))
    session.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
    folder = session.getDefaultFolder()
    for m in folder.getMessages():
        from org.apache.lucene.document import Document
        d = Document()
        subject = Field("subject", m.getSubject(), Field.Store.YES, Field.Index.ANALYZED)
        toSrc = u''
        toSrc = [((u'%s, %s') % (toSrc, str(r))) for r in m.getAllRecipients()]
        to = Field("to", toSrc.substring(toSrc.indexOf(u',')), Field.Store.YES, Field.Index.ANALYZED)
        d.add(to)
        d.add(subject)
        iw.addDocument(d)
    iw.commit()
    self.searcher = IndexSearcher(directory)
def index (cls, indexDir, taxoDir, facets_config): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer config = IndexWriterConfig(Version.LUCENE_48, WhitespaceAnalyzer(Version.LUCENE_48)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # obtain the sample facets for current document facets = categories[docNum] author = authors[docNum] # ... and use the FacetField class for adding facet fields to # the Lucene document (and via FacetsConfig to the taxonomy index) doc.add(FacetField("Author", [author])); for f in facets: doc.add(FacetField("Categories", f)) # finally add the document to the index iw.addDocument(facets_config.build(taxo, doc)); nDocsAdded += 1 # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. iw.close() taxo.close() print "Indexed %d documents with facets." % nDocsAdded
def index(self, personDB, familyDB, relationDB):
    """ indexes a database

    Field match includes information about parents and is used to find matches
    Field text has Ids, names, places, and dates and is used to find a person/family

    Rebuilds the index from scratch (CREATE mode): one document per person
    and one per family.
    """
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(self.indexDir, config)
    #indexWriter.setRAMBufferSizeMB(256)  #?
    mt = matchtext()

    # Person documents: stored uid/sex, searchable match/text.
    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
        doc.add(Field("text", mt.luceneFix(self.personText(p)),
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    #Family matchtext
    # Family documents: only uid and a small searchable text (id + refId).
    for f in familyDB.find():
        #matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
        #doc.add(Field('sex','FAM', StringField.TYPE_STORED))
        #doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
        txt = f['_id']
        if 'refId' in f:
            txt += ' ' + f['refId']
        doc.add(Field("text", txt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
def createind(product,url):
    "This function creates index for lucene"
    # NOTE(review): calls lucene.initVM() on every invocation — initVM can
    # only run once per process, so repeated calls likely fail; confirm.
    # Also relies on a module-level `counter` used as the ad id sequence.
    global counter
    counter += 1
    adId = counter
    adLine = product
    # Derived search fields: noun-phrase chunks and DBpedia-matched terms.
    field_string = chunker(product.lower())
    field_related_words = getDbpediaMatches(product, field_string)
    url = url
    lucene.initVM()
    # 1. create an index
    index_path = File("Home/WishMatcherIndex")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(index_path)

    # 2. fill the index
    config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(index, config)
    #for title in TITLES:
    import time
    millis = int(round(time.time() * 1000))
    userid = str(millis)  # NOTE(review): computed but never used
    doc = Document()
    doc.add(Field("AdId", str(adId), Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("AdLine", adLine, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("FieldString", field_string, Field.Store.YES,
                  Field.Index.ANALYZED))
    doc.add(Field("FieldRelatedWords", field_related_words, Field.Store.YES,
                  Field.Index.ANALYZED))
    doc.add(Field("URL", url, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print(adId)
    # 3. close resources
    writer.close()
    index.close()
    return ""
def create_index(index) :
    """Index every line of the combined questions file into `index`,
    using a StandardAnalyzer extended with the module-level `stopwords`.
    Each line becomes one boosted "text" field."""
    indexDir = SimpleFSDirectory(File(index))
    # Build a Lucene stop-word set from the module-level python list.
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords :
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open('f:/nlp/data/questions/combine.txt')
    for line in f :
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)  # weight question text above default fields
        doc.add(field)
        writer.addDocument(doc)

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
# docList = list() #create a list to store all strings in this document docName = document.strip('\n') docPath = '/home/650/resources/Homework1/cranfieldDocs' tree = ET.parse(docPath + '/' + docName) root = tree.getroot() docId = root.find('DOCNO').text title = root.find('TITLE').text author = root.find('AUTHOR').text biblio = root.find('BIBLIO').text text = root.find('TEXT').text # print docId, title #create a file files = Document() files.add(Field("DOCNO", docId, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(files) files.add(Field("TITLE", title, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(files) files.add(Field("AUTHOR", author, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(files) files.add(Field("BIBLIO", biblio, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(files) files.add(Field("TEXT", text, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(files) # print "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs()) # print "Closing index of %d docs..." % writer.numDocs() writer.close()
# Script: index tab-separated rows from stdin into ./lucene/.
# The first stdin line is a header; each header field's LAST character is a
# type code ('t'/'s'/'i') mapped to a Lucene field class via LUCENE_TYPES
# (defined elsewhere in the file).
indexDir = SimpleFSDirectory(File("lucene/"))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(indexDir, writerConfig)

print "%d docs in index" % writer.numDocs()
print "Reading lines from sys.stdin..."
header=[]
for n, l in enumerate(sys.stdin):
    doc = Document()
    fields = l.rstrip().split("\t")
    for (idx,field) in enumerate(fields):
        if n == 0:
            # Header row: record (name, lucene-field-type) per column.
            typechar = field[-1]
            if typechar not in set(['t','s','i']):
                sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field))
                exit(-1)
            header.append([field,LUCENE_TYPES[typechar]])
        else:
            (fname,fieldtype) = header[idx]
            if fieldtype is IntField:
                #sys.stdout.write("int field %s:%s\n" % (fname,field))
                field = int(field)  # IntField requires a real int value
            doc.add(fieldtype(fname, field, Field.Store.YES))
            #Field.Store.YES, Field.Index.ANALYZED))
            #doc.add(Field(fieldtype, field, Field.Store.YES, Field.Index.ANALYZED))
            #doc.add(fieldtype(header[idx][1],field,Field.Store.YES)
    # NOTE(review): this runs for the header row too, adding one EMPTY
    # document to the index; the final count below also reports `n` (last
    # 0-based line index) rather than the number of data lines.  Confirm.
    writer.addDocument(doc)

print "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs())
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
class IndexingEngine():
    """Index the admin document tree configured in settings.ADMINS_ENGINE,
    using per-field analyzers; directories named `<name>.sub` are recursed
    into as children of document `<name>`."""

    def __init__(self):
        self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
        self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
        self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()

        ############################# Writer Configuration #####################################
        # Per-field analyzers, falling back to 'default' for other fields.
        map = HashMap()
        map.put('name', self.mAnalyzers['name'])
        map.put('parent', self.mAnalyzers['parent'])
        map.put('content', self.mAnalyzers['default'])
        map.put('id', self.mAnalyzers['id'])
        analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)

        self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
        self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)

        if settings.ADMINS_ENGINE.mSimilarity != None:
            self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
        ########################################################################################

        directory = SimpleFSDirectory(File(self.mIndexDirectory))
        self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)

        ############################# FieldType Prepration #####################
        # name/parent: analyzed + stored, docs-only postings.
        nameField = FieldType()
        nameField.setIndexed(True)
        nameField.setStored(True)
        nameField.setTokenized(True)
        nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        parentField = FieldType()
        parentField.setIndexed(True)
        parentField.setStored(True)
        parentField.setTokenized(True)
        parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        # content: full postings incl. offsets (highlighting/phrase support).
        contentField = FieldType()
        contentField.setIndexed(True)
        contentField.setStored(True)
        contentField.setTokenized(True)
        contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        # id: keyword field (not tokenized).
        idField = FieldType()
        idField.setIndexed(True)
        idField.setStored(True)
        idField.setTokenized(False)
        idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        self.mFieldTypes = {
            'name'    : nameField,
            'parent'  : parentField,
            'content' : contentField,
            'id'      : idField
        }
        #######################################################################

        self.mLog = ""

    def indexing(self, root = settings.ADMINS_ENGINE.mDocumentDirectory, parent = [], docID = 1, parentID = 0, id = 0):
        """Recursively index files under `root`; returns the next free docID.

        parent: ancestor names used to build the full document name.
        id: recursion depth — 0 means top-level call, which commits, closes
        the writer and flushes the debug log at the end.
        NOTE(review): `parent = []` is a mutable default; append/pop keep it
        balanced here, but it is shared across calls — confirm intended.
        """
        realPath = os.path.abspath(root)
        for i in os.listdir(realPath):
            path = os.path.join(realPath, i)
            if os.path.isfile(path):
                #index this file
                doc = Document()
                doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
                doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
                doc.add(Field('id', str(docID), self.mFieldTypes['id']))
                doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

                fd = open(path, 'r')
                content = fd.read()
                fd.close()
                if len(content) > 0:
                    doc.add(Field('content', content, self.mFieldTypes['content']))
                self.mIndexWriter.addDocument(doc)

                ##################### Logging ##############################
                if IS_DEBUG:
                    nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
                    parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
                    contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
                    self.mLog = self.mLog + ( "File %s\n   {name - %s}: %s\n   {parent - %s}: %s\n   {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )

                docID = docID + 1

            ################### index sub commands
            if os.path.isdir(path + ".sub"):
                parent.append(i)
                docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
                parent.pop()

        if id == 0:
            self.mIndexWriter.commit()
            self.mIndexWriter.close()
            if IS_DEBUG:
                loggingBot = LoggingBot(self.mLog, settings.ADMINS_ENGINE.getIndexingLogQueue())
                loggingBot.start()
                self.mLog = ""
        return docID
def main(indexDir, inputDir):
    """Create a Lucene index in `indexDir` from every .json file in
    `inputDir`, then dump the indexed documents to all.csv.

    Stop words are loaded from stopwords.txt (one word per line) and
    filtered out by the StopAnalyzer. Each JSON file is expected to hold a
    list of entries with 'url', 'date', and 'title' keys; the journal code
    is taken from the file name ("<journal>.json").
    """
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    stopwords = set()
    # BUG FIX: context manager closes the file even if a read raises
    # (the original leaked the handle on error).
    with open('stopwords.txt', 'r') as stopFile:
        for line in stopFile:
            stopwords.add(line.strip())
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create index.  ('indexFSDir' renamed from 'dir', which shadowed the
    # builtin.)
    logger.info("Creating Lucene index [%s]..." % indexDir)
    indexFSDir = SimpleFSDirectory(File(indexDir))
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(indexFSDir, writerConfig)

    logger.info("Currently there are %d documents in the index..." % writer.numDocs())

    # Index documents: one Lucene document per JSON entry.
    onlyfiles = [f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json')]
    for fname in onlyfiles:
        try:
            journal_code = fname.split('.')[0]
            # BUG FIX: 'with' closes the JSON file even when json.load
            # raises (the original only closed it on the success path).
            with open(join(inputDir, fname)) as json_data:
                data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
        except IOError as v:
            # IOError may or may not unpack into (errno, message) depending
            # on how it was raised; fall back to the exception itself.
            try:
                (code, message) = v
            except:
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

    # Wrap it up
    #logger.info("About to optimize index of %d documents..." % writer.numDocs())
    #writer.optimize()
    #logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Dump every stored document to all.csv.
    reader = IndexReader.open(indexFSDir)
    try:
        with open('all.csv', 'wb') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            for i in xrange(0, reader.numDocs()):
                doc = reader.document(i)
                csvwriter.writerow([doc.get('journal'), doc.get('date'),
                                    doc.get('url').encode('utf8'),
                                    doc.get('title').strip().replace(',', '\,').encode('utf8')])
    finally:
        # BUG FIX: the original never closed the IndexReader.
        reader.close()
def store(primary_keys_map,to_be_compressed_input,collection_name,data,commit=False): INDEX_DIR_DEFAULT="IndexFiles.index" if collection_name!="DEFAULT": INDEX_DIR=collection_name else: INDEX_DIR=INDEX_DIR_DEFAULT print "started indexing input data......" #extracting values try: contents=json.loads(data) except: return 100 direc=SimpleFSDirectory(File(INDEX_DIR)) analyzer=StandardAnalyzer(Version.LUCENE_CURRENT) #checking for existance of record with same primary_key set try: ireader=IndexReader.open(direc) searcher=IndexSearcher(ireader) query=BooleanQuery() for key in primary_keys_map: temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(contents[key]) query.add(BooleanClause(temp,BooleanClause.Occur.MUST)) hits=searcher.search(query,MAX_RESULTS).scoreDocs if len(hits) > 0: return 106 except: pass #setting writer configurations config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer=IndexWriter(direc,config) #fix this later.....FieldType not defined #field_type=FieldType() #field_type.setIndexed(True) #field_type.setStored(False) #field_type.setTokenized(False) try: doc=Document() #index files wrt primary key for primary_key in primary_keys_map: try: field=Field(primary_key,contents[primary_key],Field.Store.NO,Field.Index.ANALYZED) doc.add(field) except: # primary_keys_map.pop(collection_name) return 101 #compress data using snappy if compression is on if to_be_compressed_input==True: data=snappy.compress(data) field=Field("$DATA$",data,Field.Store.YES,Field.Index.ANALYZED) doc.add(field) writer.addDocument(doc) if commit==True: writer.commit() writer.close() return 000 except: return 102