def createIndex():
    """Build a fresh Lucene index over every HTML file in ``html_files``.

    Initialises the JVM, creates (overwriting) the index at a fixed
    temporary path, parses each HTML file and stores its extracted text
    in an analyzed, stored "text" field.
    """
    print("started indexer")
    lucene.initVM()  # must run before any other lucene call
    indexDir = "/Tmp/REMOVEME.index-dir"
    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # get index storage; named 'store' so the builtin dir() is not shadowed
    store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    # create=True: any existing index at indexDir is replaced
    writer = IndexWriter(store, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    src_dir = 'html_files'
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as myfile:
            data = myfile.read()
        # parse errors are deliberately ignored; only the text is indexed
        document, _errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES,
                      Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def __init__(self):
    """Borg-style initialiser: every instance shares one state dict.

    The JVM, index directory and analyzer are created exactly once, by
    whichever instance happens to be constructed first.
    """
    self.__dict__ = self.__shared_state
    if self.__shared_state:
        # shared state already populated by an earlier instance
        return
    self.jccvm = lucene.initVM()
    self.index = SimpleFSDirectory(
        lucene.File(settings.lucene_index_dir))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
def main1(): print "retrieve and display files......" direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) searcher = lucene.IndexSearcher(direc) search(searcher, analyzer) search2(searcher, analyzer)
def __init__(self):
    """Set up the Chinese analyzer, index searcher and data connections."""
    # SmartChineseAnalyzer handles CJK tokenisation
    chinese = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    self.analyzers = {"smartcn": chinese}
    index_store = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
    # read-only searcher over the on-disk index
    self.searcher = lucene.IndexSearcher(index_store, True)
    self.pgconn = mypass.getConn()
    self.sw = sinaweibooauth.SinaWeiboOauth()
def __init__(self, index_dir):
    '''
    Initialises index parameters: a fresh Porter-stemmed writer and a
    read-only reader over the index at *index_dir* (created if missing).
    '''
    lucene.initVM()
    self.index_dir = index_dir
    if not os.path.exists(self.index_dir):
        os.mkdir(self.index_dir)
    write_store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    self.analyser = PorterStemmerAnalyzer()
    # create=True: the index is rebuilt from scratch
    self.writer = lucene.IndexWriter(
        write_store, self.analyser, True,
        lucene.IndexWriter.MaxFieldLength.LIMITED)
    self.writer.setMaxFieldLength(1048576)
    # a second directory handle backs the read-only reader
    read_store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    self.reader = lucene.FilterIndexReader.open(read_store, True)
def search(self, restrictions, destination): """ @see: L{NullPrincipalSearcher<datafinder.persistence.search.searcher.NullSearcher>} E1101: Pylint cannot detect the internals of the modules solr and lucene. """ # pylint: disable=E1101 results = list() queryString = search_restriction_mapping.mapSearchRestriction( restrictions) if self._configuration.luceneIndexUri.startswith("file:///"): try: self._configuration.env.attachCurrentThread() indexDir = lucene.SimpleFSDirectory( lucene.File( self._configuration.luceneIndexUri.replace( "file:///", ""))) analyzer = lucene.StandardAnalyzer( lucene.Version.LUCENE_CURRENT) searcher = lucene.IndexSearcher(indexDir) query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "content", analyzer).parse(queryString) hits = searcher.search(query, constants.MAX_RESULTS) for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) results.append("/%s" % urllib.unquote( doc.get(constants.FILEPATH_FIELD).encode("utf-8"))) searcher.close() except Exception, error: errorMessage = "Cannot search items. Reason: '%s'" % error raise PersistenceError(errorMessage)
def __init__(self, forumname):
    """Open a read-only searcher over the per-forum index.

    forumname -- key into ``supported_forums``; the process exits (as in
        the original behaviour) when the forum is not supported.
    """
    # guard clause replaces the needless else-nesting; 'not in' idiom
    if forumname not in self.supported_forums:
        sys.exit()
    self.forum = forumname
    self.STORE_DIR = self.STORE_BASE_DIR + forumname
    # Chinese-aware tokenisation for forum content
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    self.analyzers = {"smartcn": smartcn}
    directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
    self.searcher = lucene.IndexSearcher(directory, True)
    self.pgconn = mypass.getConn()
def __init__(self, network): self.network = network smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33) #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33) analyzers = {"smartcn": smartcn} self.pgconn = mypass.getConn() writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"]) writerconfig.setWriteLockTimeout(600000L) writerconfig.setMaxThreadStates(50) writerconfig.setRAMBufferSizeMB(128.0) self.storeDir = self.storeDirBase + self.network store = lucene.SimpleFSDirectory(lucene.File(self.storeDir)) self.writer = lucene.IndexWriter(store, writerconfig)
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = lucene.SimpleFSDirectory(lucene.File(storeDir)) writer = lucene.IndexWriter(store, analyzer, True, lucene.IndexWriter.MaxFieldLength.LIMITED) writer.setMaxFieldLength(1048576) self.indexDocs(root, writer) ticker = Ticker() print 'optimizing index', threading.Thread(target=ticker.run).start() writer.optimize() writer.close() ticker.tick = False print 'done'
def __init__(self): smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33) #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33) analyzers = { "smartcn": smartcn } self.pgconn = mypass.getConn() self.sw = sinaweibooauth.SinaWeiboOauth() if not os.path.exists(self.storeDir): os.mkdir(self.storeDir) store = lucene.SimpleFSDirectory(lucene.File(self.storeDir)) writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"]) writerconfig.setWriteLockTimeout(600000L) writerconfig.setMaxThreadStates(50) writerconfig.setRAMBufferSizeMB(128.0) self.writer = lucene.IndexWriter(store, writerconfig)
def func_pic(command):
    """Run an image search for *command* against the graph index.

    Returns the (resultInfo, title, url, imgurl, score) tuple produced
    by run().
    """
    global vm_env
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()  # JVM thread affinity for worker threads
    STORE_DIR = "graphIndex"
    directory = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
    searcher = lucene.IndexSearcher(directory, True)
    analyzer = lucene.SimpleAnalyzer(lucene.Version.LUCENE_CURRENT)
    try:
        # run() yields all five result components directly; the dead
        # list initialisations from the original were removed.
        resultInfo, title, url, imgurl, score = run(command, searcher,
                                                    analyzer)
    finally:
        # close the searcher even when run() raises, avoiding a leak
        searcher.close()
    return resultInfo, title, url, imgurl, score
def search(request, template_name='reviews/search.html', local_site_name=None):
    """ Searches review requests on Review Board based on a query string. """
    query = request.GET.get('q', '')
    siteconfig = SiteConfiguration.objects.get_current()
    if not siteconfig.get("search_enable"):
        # FIXME: show something useful
        raise Http404
    if not query:
        # FIXME: I'm not super thrilled with this
        return HttpResponseRedirect(reverse("root"))
    # a purely numeric query jumps straight to that review request
    if query.isdigit():
        query_review_request = get_object_or_none(ReviewRequest, pk=query)
        if query_review_request:
            return HttpResponseRedirect(query_review_request.get_absolute_url())
    import lucene
    # pick the 2.x or 3.x PyLucene API from the bindings version string
    lv = [int(x) for x in lucene.VERSION.split('.')]
    lucene_is_2x = lv[0] == 2 and lv[1] < 9
    lucene_is_3x = lv[0] == 3 or (lv[0] == 2 and lv[1] == 9)
    # We may have already initialized lucene
    try:
        lucene.initVM(lucene.CLASSPATH)
    except ValueError:
        pass
    index_file = siteconfig.get("search_index_file")
    if lucene_is_2x:
        store = lucene.FSDirectory.getDirectory(index_file, False)
    elif lucene_is_3x:
        store = lucene.FSDirectory.open(lucene.File(index_file))
    else:
        assert False
    try:
        searcher = lucene.IndexSearcher(store)
    except lucene.JavaError, e:
        # FIXME: show a useful error
        raise e
    # NOTE(review): the view appears truncated here — the actual query
    # execution and response rendering are outside this view.
def index_files(board, time_delta):
    """(Re)build the recent-posts index for *board*.

    Walks the files reported by get_all_files() for the given time
    window, decodes them from GBK and indexes name/owner/title as
    stored, non-analyzed fields plus the body as analyzed, unstored
    text.  Files that are missing, empty, or not valid GBK are skipped.
    """
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store,
        lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
        True,  # create a fresh index on each run
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    # writer.setMaxFieldLength(1048576)  # 1MB
    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue
        # 'with' guarantees the handle is closed on every path, including
        # unexpected exceptions (the original leaked it in that case)
        with open(path, 'r') as f:
            contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            # skip files whose metadata/body is not valid GBK
            debug(filename)
            continue
        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(lucene.Field("name", filename, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("owner", owner, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("title", title, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("contents", contents, lucene.Field.Store.NO,
                                 lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
def main1(): print "started indexing sample files......" direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer) config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = lucene.IndexWriter(direc, config) #fix this later.....FieldType not defined #field_type=lucene.FieldType() #field_type.setIndexed(True) #field_type.setStored(False) #field_type.setTokenized(False) file1 = open("nitin.json") data = file1.read() contents = json.loads(data) doc = lucene.Document() field = lucene.Field("name", contents['name'], lucene.Field.Store.NO, lucene.Field.Index.ANALYZED) doc.add(field) field = lucene.Field("data", data, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED) doc.add(field) writer.addDocument(doc) file1.close() file1 = open("nitin2.json") data = file1.read() contents = json.loads(data) doc = lucene.Document() field = lucene.Field("name", contents['name'], lucene.Field.Store.NO, lucene.Field.Index.ANALYZED) doc.add(field) field = lucene.Field("data", data, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED) doc.add(field) writer.addDocument(doc) file1.close() writer.optimize() print "Indexed and optimized %d documents" % writer.numDocs() writer.close()
def __init__(self, root, storeDir, analyzer, startDate, endDate): if not os.path.exists(storeDir): os.mkdir(storeDir) store = lucene.SimpleFSDirectory(lucene.File(storeDir)) # 创建IndexWriter对象,第一个参数是Directory,第二个是分词器, # 第三个表示是否是创建,如果为false为在此基础上面修改, # 第四表示表示分词的最大值,比如说new MaxFieldLength(2),就表示两个字一分,一般用IndexWriter.MaxFieldLength.LIMITED writer = lucene.IndexWriter(store, analyzer, False, lucene.IndexWriter.MaxFieldLength.LIMITED) writer.setMaxFieldLength(1048576) self.indexDocs(root, writer, startDate, endDate) ticker = Ticker() print 'optimizing index', threading.Thread(target=ticker.run).start() writer.optimize() writer.close() ticker.tick = False print 'done'
def Index(): field_list, conn, _config_dict = _InitIndexer() indexDir = _config_dict['indexDir'] if not os.path.exists(indexDir): os.mkdir(indexDir) store = SimpleFSDirectory(lucene.File(indexDir)) #print store writer = IndexWriter(store, SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED) writer.setMaxFieldLength(1048576) try: ticker = Ticker() ticker.start() _IndexDocs(writer, field_list, conn) ticker.end() ticker.TimeCost() except Exception, e: print "Failed in Indexing...", e traceback.print_exc()
def search(self, query, field="content", limit=None):
    '''
    Searches the index based on the query supplied.

    query -- free-text query string, parsed against *field*.
    field -- document field to search (default "content").
    limit -- maximum number of hits; None collects every match.
    '''
    directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    searcher = lucene.IndexSearcher(directory, True)  # read-only
    # the raw query string is replaced by the parsed Query object
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, field,
                               self.analyser).parse(query)
    try:
        #if there's no limit then use a collector to retrieve them all
        if limit is None:
            collector = DocumentHitCollector(searcher)
            scoreDocs = searcher.search(query, collector)
            results = collector.get_collected_documents()
        else:
            scoreDocs = searcher.search(query, limit).scoreDocs
            results = []
            for scoreDoc in scoreDocs:
                results.append(searcher.doc(scoreDoc.doc))
    except lucene.JavaError, e:
        # NOTE(review): the error is only printed, leaving 'results'
        # possibly unbound; no 'return results' and no searcher.close()
        # are visible in this view — confirm whether the function
        # continues beyond this chunk.
        print e
def handle_noargs(self, **options): siteconfig = SiteConfiguration.objects.get_current() # Refuse to do anything if they haven't turned on search. if not siteconfig.get("search_enable"): sys.stderr.write('Search is currently disabled. It must be ' 'enabled in the Review Board administration ' 'settings to run this command.\n') sys.exit(1) if not have_lucene: sys.stderr.write('PyLucene is required to build the search index.\n') sys.exit(1) incremental = options.get('incremental', True) store_dir = siteconfig.get("search_index_file") if not os.path.exists(store_dir): os.mkdir(store_dir) timestamp_file = os.path.join(store_dir, 'timestamp') timestamp = 0 if incremental: try: f = open(timestamp_file, 'r') timestamp = datetime.utcfromtimestamp(int(f.read())) f.close() except IOError: incremental = False f = open(timestamp_file, 'w') f.write('%d' % time.time()) f.close() if lucene_is_2x: store = lucene.FSDirectory.getDirectory(store_dir, False) writer = lucene.IndexWriter(store, False, lucene.StandardAnalyzer(), not incremental) elif lucene_is_3x: store = lucene.FSDirectory.open(lucene.File(store_dir)) writer = lucene.IndexWriter(store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), not incremental, lucene.IndexWriter.MaxFieldLength.LIMITED) else: assert False status = Q(status='P') | Q(status='S') objects = ReviewRequest.objects.filter(status) if incremental: query = Q(last_updated__gt=timestamp) # FIXME: re-index based on reviews once reviews are indexed. I # tried ORing this in, but it doesn't seem to work. 
# Q(review__timestamp__gt=timestamp) objects = objects.filter(query) if sys.stdout.isatty(): print 'Creating Review Request Index' totalobjs = objects.count() i = 0 prev_pct = -1 for request in objects: try: # Remove the old documents from the index if incremental: writer.deleteDocuments(lucene.Term('id', str(request.id))) self.index_review_request(writer, request) if sys.stdout.isatty(): i += 1 pct = (i * 100 / totalobjs) if pct != prev_pct: sys.stdout.write(" [%s%%]\r" % pct) sys.stdout.flush() prev_pct = pct except Exception, e: sys.stderr.write('Error indexing ReviewRequest #%d: %s\n' % \ (request.id, e))
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for
    each file.

    oboFile -- path of the OBO ontology file to index.
    outDir -- directory under which a per-file index directory (named
        after the OBO file's basename) is created.
    xref_map -- mapping used to rewrite xref identifiers before indexing.

    Raises ExistingIndexDirectoryException when the target index
    directory already exists.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )

    # named 'store' so the builtin dir() is not shadowed
    store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        # names get a strong boost (4.0) so exact-name matches rank first
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # Frequently in the definition text we will run into URLs or some
        # sort of hyperlinks that could query hits that we would not want
        # to occur thus errantly increasing the score of the field.
        # We will strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate ID's are
        # all represented as lists in our Ontology object and need to be
        # entered in one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)
        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
#!/usr/bin/env python # -*- coding: utf-8 -*- import lucene, sys, os import traceback path = os.getcwd() lucene.initVM() dict = lucene.File("./myindex") directory = lucene.FSDirectory.open(dict) sp = lucene.SpellChecker(directory) dictionary = lucene.File("%s/pipimovieUTF8.txt" % path) sp.indexDictionary(lucene.PlainTextDictionary(dictionary)) suggestions = sp.suggestSimilar("天汽预报", 2) for item in suggestions: print item
# Module setup: opens two on-disk Lucene indexes (collocations and
# locations) plus a RAKE keyword extractor at import time.
import lucene
import sys
sys.path.append("..")  # make the sibling 'util' package importable
import util
from util.rake import Rake

print("load vm")
index_dir = '../../data/index/'  # collocation index location
location_dir = '../../data/location/'
lucene.initVM()
directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
directory1 = lucene.SimpleFSDirectory(lucene.File(location_dir))
analyzer1 = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
rake = Rake("../../data/SmartStoplist.txt")


def search(word):
    # NOTE(review): the body appears truncated in this view — only the
    # JVM-thread attach and searcher construction are visible.
    print("searching ")
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()  # required per thread before lucene calls
    searcher = lucene.IndexSearcher(directory, True)
# -*_ coding: utf-8 -*- # from lucene import * import lucene text = ["a b c d" , "c d e e"] texts = ["Python 是 一个 很有 吸引力 的 语言", "C++ 语言 也 很 有 吸引力 , 长久 不衰", "我们 希望 Python 和 C++ 高手加入", "我们 的 技术 巨牛 ,人人 都是 高手"] initVM() INDEX_DIR = '/root/weibo_corpus/post_index' directory = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR)) analyzer = SimpleAnalyzer() def read(filename): text = [] with open(filename,'r') as f: count = 0 for line in f: text.append(line.strip()) count = count + 1 if(count%10000==1): print(count) return text def search(searcher,qtext):
# NOTE(review): this chunk begins mid-function — the lines below close a
# results.append(...) call of a routine whose definition is outside this
# view.
        doc.get("title").encode('gbk')
    ])
    # sort result
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title


def test_fixture():
    # point BOARDSPATH at the current directory for local testing
    global BOARDSPATH
    BOARDSPATH = './'


if __name__ == '__main__':
    #test_fixture()
    board = sys.argv[1]
    # the query arrives GBK-encoded on the command line
    querystr = sys.argv[2].decode('gbk').strip()
    lucene.initVM()
    path = BOARDSPATH + board + '/' + RECENT_INDEX
    # bail out when the index is missing or the query is empty
    if not os.path.exists(path) or len(querystr) == 0:
        sys.exit(-1)
    directory = lucene.SimpleFSDirectory(lucene.File(path))
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    run(searcher, analyzer, querystr)
    searcher.close()
def __init__(self, storeDir):
    """Start the JVM and open the index directory at *storeDir*.

    Stores the SimpleFSDirectory handle on ``self.dir`` for later use.
    """
    lucene.initVM()  # must run before any other lucene call
    print 'lucene', lucene.VERSION
    self.dir = lucene.SimpleFSDirectory(lucene.File(storeDir))
#OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, #SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT #LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, #DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY #THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT #(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ Creates a sample Lucene index for the full-text search feature. """ import lucene import sys if __name__ == "__main__": lucene.initVM() indexDir = "D:/Downloads/index" dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = lucene.IndexWriter(dir_, analyzer, True, lucene.IndexWriter.MaxFieldLength(512)) print("Currently there are %d documents in the index..." % writer.numDocs()) content = ( "Strategische Konzeption, Umsetzung und Betreuung von langfristig " + "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.") doc = lucene.Document() doc.add( lucene.Field("content", content, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) doc.add(
#!/usr/bin/python #coding: utf-8 #建索引的文件 import lucene import csv index_dir = '../../data/index/' data_dir = '../../data/corpus.csv' lucene.initVM() directory = lucene.SimpleFSDirectory(lucene.File(index_dir)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) def build_index(): f = open(data_dir) reader = csv.reader(f) print("开始创建索引") indx = 0 writer = lucene.IndexWriter(directory, analyzer, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) for line in reader: eng, zh = line[0], line[1] doc = lucene.Document()
def __init__(self, dir_file_path):
    """Start the JVM and open a searcher over the index at *dir_file_path*.

    Exposes:
      self.directory -- SimpleFSDirectory over the index path
      self.analyzer  -- StandardAnalyzer (Lucene 3.0 compatibility mode)
      self.search    -- IndexSearcher over self.directory
    """
    lucene.initVM()  # must run before any other lucene call
    self.directory = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
    self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
    self.search = lucene.IndexSearcher(self.directory)