def __init__(self, writerConfig, indexDir):
    lucene.initVM()
    self.mIndexDir = SimpleFSDirectory(File(indexDir))
    self.mConfig = writerConfig
    self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)
        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    print "indexed %s docs" % (i + 1)
    writer.close()
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def index(self):
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    # read input files (.xml)
    for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        corpus = codecs.open(in_file, encoding='utf-8').read()
        d = pq(corpus, parser='html')
        for text in d('Article').items():
            document = Document()
            # find ID
            art_id = str(text.attr('articleid').encode('utf-8')).replace('+', '-')
            # find Title
            art_title = self.stem(str(text.attr('title').encode('utf-8')))
            # find Abstract
            art_abstract = self.stem(str(text.find('Abstract').html().encode('utf-8')))
            # find Keyword
            art_keyword = text.find('Keyword').html().encode('utf-8')
            # find Content
            art_content = self.stem(str(text.find('Content').html().encode('utf-8')))
            # find Authors
            art_authors = text.find('Authors').html().encode('utf-8')
            document.add(Field('id', art_id, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('title', art_title, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('content', art_content, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('authors', art_authors, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('article',
                               art_title + art_abstract + art_keyword + art_content,
                               Field.Store.YES,
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
    index_writer.optimize()
    index_writer.close()
def run(self):
    print "Booting lucene driver worker...."
    lucene.initVM()
    self.fieldType1 = FieldType()
    self.fieldType1.setIndexed(True)
    self.fieldType1.setStored(False)
    self.fieldType1.setTokenized(True)
    self.fieldType2 = FieldType()
    self.fieldType2.setIndexed(True)
    self.fieldType2.setStored(True)
    self.fieldType2.setTokenized(False)
    while True:
        data = self.queue.get()
        da = data[1]
        response = None
        try:
            self.fil = File(da['data']['indexdir'])
            self.d = NIOFSDirectory(self.fil)
            self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            self.conf = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
            response = getattr(self, da['action'])(da['data'])
            self.d.close()
        except Exception as e:
            print e
        if response is None:
            response = {}
        self.ret[data[0]] = response
def names():
    lst = []
    search = "spax"  # request.form['product']
    lucene.initVM()
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    # print lst
    data = {"products": lst}
    if request.method == 'POST':
        return jsonify(data)
    else:
        return jsonify(data)
def retrieve(self, query, max_res=10):
    lucene.initVM()
    inDir = SimpleFSDirectory(File(self.INDEX_DIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(inDir)
    my_query = QueryParser(Version.LUCENE_30, 'content', lucene_analyzer).parse(query)
    MAX = max_res
    total_hits = lucene_searcher.search(my_query, MAX)
    res_head = '{"query":"' + query + '","results":['
    res_tail = ']}'
    result = res_head
    hits = total_hits.totalHits
    if hits > 0:
        res_body = ''
        it = 0
        for hit in total_hits.scoreDocs:
            it += 1
            doc = lucene_searcher.doc(hit.doc)
            res_body += '{"rank":' + str(it) + \
                        ',"score":"' + str(hit.score) + \
                        '","title":"' + doc.get('title').encode('utf-8') + \
                        '","id":"' + doc.get('id').encode('utf-8') + '"}'
            if it < hits:
                res_body += ','
        result += res_body
    result += res_tail
    return result
def document(self, docId, max_res=1):
    lucene.initVM()
    inDir = SimpleFSDirectory(File(self.INDEX_DIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(inDir)
    my_query = QueryParser(Version.LUCENE_30, 'id', lucene_analyzer).parse(docId)
    MAX = max_res
    total_hits = lucene_searcher.search(my_query, MAX)
    result = '{'
    hits = total_hits.totalHits
    if hits == 1:
        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            result += '"id":"' + doc.get('id') + \
                      '","title":"' + doc.get('title') + \
                      '","abstract":"' + doc.get('abstract') + \
                      '","keyword":"' + doc.get('keyword') + \
                      '","content":"' + doc.get('content') + \
                      '","authors":"' + doc.get('authors') + '"'
    result += '}'
    return result
def initial_searcher():
    lucene.initVM()
    indexDir = INDEX_DIR
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    return searcher, analyzer
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: Search a built index and return up to `limit` responses
    Arguments: input index folder, limit on the number of results returned, query (as string)
    Returns: paths of responsive files as a list
    '''
    logging.basicConfig(filename=os.path.join(index_dir, "lucene_search.log"))
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from " + index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)  # Lucene version used to generate index
    searcher = IndexSearcher(index)
    logger.info("Parsing query :" + query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)
    logger.info("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    hit_paths = []
    for hit in hits.scoreDocs:
        # The following also exposes the score of each responsive document and
        # the content index which matched:
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))
    return hit_paths
def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text", lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "Hit String:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)
    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
def search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    args = []
    if request.method == 'POST':
        if request.form['ies']:
            args.append('+ies:' + request.form['ies'])
        if request.form['area']:
            args.append('+area:' + request.form['area'])
        if request.form['professor']:
            args.append('+professor:' + request.form['professor'])
        if request.form['conceito']:
            # args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
            args.append('m:' + request.form['conceito'])
            args.append('d:' + request.form['conceito'])
            args.append('f:' + request.form['conceito'])
    table = []
    if len(args) > 0:
        scoreDocs = mansearch.buscar('indexer/', args)
        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(DirectoryReader.open(fsDir))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
    return render_template('busca.html', table=table)
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)
    return idx, pred
def build_words_index(self):
    relevant_words = self.process_texts()
    # Initialize lucene and the JVM
    lucene.initVM()
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Get index storage
    store = SimpleFSDirectory(File(self.index_words))
    # Get index writer
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(store, config)
    try:
        for word in relevant_words:
            time_series = TimeSeries(word).get_series()
            series_str = ''
            for t in time_series:
                series_str += str(t) + ':' + str(time_series[t]) + '\t'
            doc = Document()
            # Add fields to this document
            doc.add(Field('word', word, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field('series', series_str, Field.Store.YES, Field.Index.ANALYZED))
            # Add the document to the index
            writer.addDocument(doc)
    except Exception, e:
        print "Failed in creating document to add to the index:", e
def __init__(self, root, store_dir):
    if not os.path.exists(store_dir):
        os.mkdir(store_dir, 0777)
    # NOTE: Hardcoded the analyzer instead of passing it
    lucene.initVM()
    '''
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    '''
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    # Set the permissions to 777 for the index directory and the write.lock file
    chmod_indexdir_cmd = "chmod 0777 " + store_dir
    writelock_file = store_dir + "/" + "write.lock"
    chmod_writelock_cmd = "chmod 0777 " + writelock_file
    if os.path.exists(store_dir):
        cicmd = os.popen("sudo -S %s" % (chmod_indexdir_cmd), 'w').write('vagrant')
    if os.path.exists(writelock_file):
        cwcmd = os.popen("sudo -S %s" % (chmod_writelock_cmd), 'w').write('vagrant')
    # Setting CREATE will rewrite over the existing indexes.
    # config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    writer.close()
def search(request, template_name="reviews/search.html"):
    """
    Searches review requests on Review Board based on a query string.
    """
    query = request.GET.get("q", "")
    siteconfig = SiteConfiguration.objects.get_current()
    if not siteconfig.get("search_enable"):
        # FIXME: show something useful
        raise Http404
    if not query:
        # FIXME: I'm not super thrilled with this
        return HttpResponseRedirect(reverse("root"))
    import lucene
    # We may have already initialized lucene
    try:
        lucene.initVM(lucene.CLASSPATH)
    except ValueError:
        pass
    index_file = siteconfig.get("search_index_file")
    store = lucene.FSDirectory.getDirectory(index_file, False)
    try:
        searcher = lucene.IndexSearcher(store)
    except lucene.JavaError, e:
        # FIXME: show a useful error
        raise e
def ExportIndex(b_print=False, b_write_file=False, b_filter=True):
    _dict = ReadConfig()
    initVM()
    try:
        if b_write_file == True:
            output_file = _dict['resultDir'] + '/' + sys.argv[1] + '.xls'
            _fw = open(output_file, 'w')
        directory = SimpleFSDirectory(File(_dict['indexDir']))
        ireader = IndexReader.open(directory)
        # Enum all the terms
        all_terms = ireader.terms()
        word_dict = {}
        _stopword_set = ImportStopword()
        # SetPrint(_stopword_set)
        while all_terms.next():
            term_elem = all_terms.term()
            if term_elem.field() == sys.argv[1]:
                _temp = term_elem.text().rstrip()
                word_dict[_temp] = all_terms.docFreq()
        if b_filter == True:
            StopwordFilter(word_dict, _stopword_set)
        if b_print != False:
            DictPrint(word_dict)
        if b_write_file != False:
            DictPrint(word_dict, out_file=_fw)
            _fw.close()
        all_terms.close()
        return word_dict
    except Exception, e:
        print "Failed: ", e
        traceback.print_exc(file=sys.stdout)
def start_jvm(clspath='', vmargs=''):
    """Starts the JVM - note that only the first initVM() call has any effect on the Java VM!
    Make sure you pass the important arguments in the first call, because they set up
    the environment of the Java VM.
    @keyword clspath: platform-separator separated values that will be passed to the Java VM
    @keyword vmargs: other arguments, comma-separated, to pass to the Java VM
    @return: JVM object (either a new one or the already existing one)"""
    # initialize the JVM if not already initialized
    jvm = _jcc_module.getVMEnv()
    if not jvm:
        classpath = []
        if clspath:
            classpath.append(clspath)
        if dumeanj:
            classpath.append(dumeanj.CLASSPATH)
        if lucene:
            classpath.append(lucene.CLASSPATH)
        jvm = _jcc_module.initVM(os.pathsep.join(classpath), vmargs=vmargs)
        if lucene != _jcc_module:
            lucene.initVM(lucene.CLASSPATH)
    elif vmargs:
        raise Exception('initVM() was already started, the second call will be ineffective. '
                        'Please make sure you are initializing components in the right order!')
    return jvm
def search(r, keyword=""): import logging logger = logging.getLogger("search") bench = Benchmark(logger) from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit import lucene, os os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17" lucene.initVM(lucene.CLASSPATH) directory = FSDirectory.open(File(CONFIG.INDEX_PATH)) ROBOT_INDEX = IndexSearcher(directory, True) ROBOT_ANALYZER = StandardAnalyzer() keyword = keyword or r.GET["keyword"] query = QueryParser("context", ROBOT_ANALYZER) query = query.parse('"%s"' % keyword) bench.start_mark("search") hits = ROBOT_INDEX.search(query) count = len(hits) result = [] i = 0 for hit in hits: i += 1 if i > 100: break doc = Hit.cast_(hit).getDocument() result.append(SearchResult(doc, i, keyword)) ROBOT_INDEX.close() et = bench.stop_mark() return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # print 'Inside Index Except'
        writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
        # e = sys.exc_info()[0]
        # print e
    # print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    # print 'In the index function'
    # print writer.numDocs()
    # print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    # print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    # print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    # print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    # print 'ending Indexing'
    # print string
    # print 'Total indexes'
    # print writer.numDocs()
    writer.close()
def _init_search(cls):
    """
    Initializes everything needed for search.
    """
    config_path = cls.search_config_path
    if not os.path.exists(config_path):
        raise OSError(errno.ENOENT, "Config %r does not exist." % config_path, config_path)
    config_dir = os.path.dirname(config_path)
    # Read config.
    with open(config_path, 'rb') as fh:
        config = json.load(fh)
    cls.search_config = config
    # Connect to mongo.
    host = config['mongo']['host']
    port = config['mongo'].get('port', None) or 27017
    thread_pool = reactor.getThreadPool()
    pool_size = int(math.ceil((thread_pool.min + thread_pool.max) / 2))
    cls.search_mongo = txmongo.lazyMongoConnectionPool(host=host, port=port, pool_size=pool_size)
    cls.search_order_db = cls.search_mongo[config['mongo']['order_dbname']]
    cls.search_order_tb = cls.search_order_db[config['mongo']['order_tbname']]
    # Initialize PyLucene.
    lucene.initVM()
    # Open index.
    index_path = os.path.abspath(os.path.join(config_dir, config['lucene']['index_path']))
    if not os.path.exists(index_path):
        raise OSError(errno.ENOENT, "Index %r does not exist." % index_path, index_path)
    elif not os.path.isdir(index_path):
        raise OSError(errno.ENOTDIR, "Index %r is not a directory." % index_path, index_path)
    index_dir = lucene.NIOFSDirectory(lucene.File(index_path))
    # index_dir = lucene.SimpleFSDirectory(lucene.File(index_path))  # windows
    cls.search_searcher = lucene.IndexSearcher(index_dir)
def build_lda_corpus(index_folder, paths_index_file, dictionary_file, ldac_file,
                     min_frequency, min_word_len, max_word_len=20):
    '''
    The main function that does the job!
    '''
    initVM()
    store = SimpleFSDirectory(File(index_folder))
    index_reader = IndexReader.open(store)
    # Stores the file paths index (for LDA)
    _store_file_paths_index(index_reader, paths_index_file)
    # Creates the dictionary
    _create_dictionary(index_reader, dictionary_file, min_frequency, min_word_len, max_word_len)
    # Creates the corpus
    dictionary = corpora.Dictionary().load(dictionary_file)
    # doesn't load the corpus into memory!
    corpus_memory_friendly = _TextCorpus(dictionary, index_reader)
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly, id2word=dictionary)
    logging.info('The Enron corpus building is completed.')
def search(self):
    '''
    Searches the given query in the index
    '''
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath('.'))
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # print 'path:', doc.get("path"), 'name:', doc.get("name")
            print doc
def get_candidates(qatp):
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
def __init__(self):
    # Create index directory
    lucene.initVM(lucene.CLASSPATH)
    if not os.path.exists(STORE_DIR):
        os.mkdir(STORE_DIR)
    self.store = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
    self.im = IndexManager()
def build_index():
    lucene.initVM()
    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)
    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)
    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
# -*- coding: cp949 -*-
import os, os.path, sys

os.environ['PATH'] = os.path.join(os.environ['JAVA_HOME'], r'jre\bin\client') + ';' + os.environ['PATH']

import lucene
lucene.initVM(lucene.CLASSPATH)  # Initialize JVM


def IndexCreate(fileDir, indexDir):
    analyzer = lucene.StandardAnalyzer()  # create the analyzer object used by Lucene
    store = lucene.FSDirectory.getDirectory(indexDir)
    writer = lucene.IndexWriter(store, analyzer)
    for root, dirnames, filenames in os.walk(fileDir):  # look only at text files in the given folder
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print("Adding: %s" % filename)
            try:
                path = os.path.join(root, filename)
                f = open(path)
                content = f.read()
                f.close()
                content = content.decode('cp949').encode('utf-8')  # convert the encoding to 'utf-8'
                doc = lucene.Document()  # create a Document object
                doc.add(lucene.Field(
                    "name",  # file name
                    filename,
                    lucene.Field.Store.YES,
                    lucene.Field.Index.NO))
                doc.add(lucene.Field(
                    "path",  # file path
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through the database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
      ID, text, Reddit ID, subreddit, meta, time, author, ups, downs,
      authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)
    print()

    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(duplicate_counter))

    writer.commit()
    writer.close()
    print()
    print("Finished indexing!")
# noinspection PyUnresolvedReferences
from org.apache.lucene.search import IndexSearcher, PhraseQuery, RegexpQuery
# noinspection PyUnresolvedReferences
from org.apache.lucene.search.spans import SpanMultiTermQueryWrapper, SpanNearQuery
# noinspection PyUnresolvedReferences
from org.apache.lucene.index import DirectoryReader, Term
# noinspection PyUnresolvedReferences
from org.apache.lucene.store import FSDirectory
# noinspection PyUnresolvedReferences
from org.apache.lucene.queryparser.classic import QueryParser
# noinspection PyUnresolvedReferences
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == "__main__":
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get("D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    term = Term("contents", "tiger")
    print(f'Tiger frequency: {reader.totalTermFreq(term)}')
    q_regex = RegexpQuery(Term("contents", "[0-9]+\.?[0-9]*"))
    print(f'regex results: {searcher.search(q_regex, 1000000).totalHits}')
    span1 = SpanMultiTermQueryWrapper(q_regex)
    span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
    spannearquery = SpanNearQuery([span1, span2], 20, True)
    print(
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, TermQuery, PhraseQuery, PrefixQuery, FuzzyQuery
from org.apache.lucene.search import WildcardQuery
import lucene
from org.apache.lucene import analysis, document, index, queryparser, search, store
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.search import IndexSearcher, TermQuery, PhraseQuery
from org.apache.lucene.index import (IndexWriter, IndexReader, DirectoryReader, Term, IndexWriterConfig)
from lupyne import engine
from tqdm import tqdm
import unicodedata

assert lucene.getVMEnv() or lucene.initVM()

################################################################
# ENVIRON
################################################################
# This is gpu
server1_homepath = "/home/ubuntu/workspace/codelab/"
server2_homepath = "/home/ubuntu/workspace/codelab/"
gpu_homepath = "/home/shawn/workspace/research/final_codelab/"
jun_homepath = "/home/junw/workspace/codelab/"

# choose from server1, server2, gpu, jun.
SERVERNAME = 'server1'
HOMEPATH = {
    'server1': server1_homepath,
    'server2': server2_homepath,
    'gpu': gpu_homepath,
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        ## explanation = searcher.explain(query, scoreDoc.doc)
        print "------------------------"
        print 'path:', doc.get("path")
        print 'name:', doc.get("name")
        print 'title:', doc.get('title')
        print 'author:', doc.get('author')
        print 'language:', doc.get('language')
        ## print explanation


if __name__ == '__main__':
    STORE_DIR = "index"
    initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    searcher.close()
                                     lucene.Field.Index.ANALYZED))
                doc.add(lucene.Field("url", url,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("urltitle", title,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                writer.addDocument(doc)  # IndexWriter.addDocument writes the document into the index folder
                print "----------------------------------------------------"
            except Exception, e:
                print "Failed in indexDocs:", e
        else:
            break
    t.close()


if __name__ == '__main__':
    ## if len(sys.argv) < 2:
    ##     print IndexFiles.__doc__
    ##     sys.exit(1)
    lucene.initVM()  # initialize the Java virtual machine
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        ## IndexFiles(sys.argv[1], "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        # IndexFiles('html', "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        IndexFiles('html', "index for pic", lucene.WhitespaceAnalyzer(lucene.Version.LUCENE_CURRENT))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
def initVM():
    vm_env = lucene.getVMEnv()
    if vm_env:
        vm_env.attachCurrentThread()
    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
def __init__(self, dir_file_path):
    lucene.initVM()
    self.directory = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
    self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
    self.search = lucene.IndexSearcher(self.directory)
def __init__(self):
    lucene.initVM()
def __init__(self, xmlpath, indexpath, ItemClass):
    self.jccenv = lucene.initVM()
    self.xmlpath = xmlpath
    self.indexpath = indexpath
    self.ItemClass = ItemClass
    help='optional configuration file or json object of global params')
parser.add_argument(
    '--autoreload', type=float, metavar='SECONDS',
    help='automatically reload modules; replacement for engine.autoreload',
)
parser.add_argument(
    '--autoupdate', type=float, metavar='SECONDS',
    help='automatically update index version and commit any changes')
parser.add_argument('--real-time', action='store_true',
                    help='search in real-time without committing')

args = parser.parse_args()
read_only = args.read_only or len(args.directories) > 1
kwargs = {'nrt': True} if args.real_time else {}
if read_only and (args.real_time or not args.directories):
    parser.error('incompatible read/write options')
if args.config and not os.path.exists(args.config):
    args.config = {'global': json.loads(args.config)}
assert lucene.initVM(vmargs='-Xrs,-Djava.awt.headless=true')
cls = WebSearcher if read_only else WebIndexer
root = cls(*args.directories, **kwargs)
start(root, config=args.config, autoreload=args.autoreload, autoupdate=args.autoupdate)
elif sys.argv[i] == "-co" or sys.argv[i] == "--check-only": sw.checkonly = True elif sys.argv[i] == "-u" or sys.argv[i] == "--update": sw.doupdate = True elif sys.argv[i] == "-srp" or sys.argv[i] == "--save-range-partition": sw.saveRP = True elif sys.argv[i] == "-i" or sys.argv[i] == "--index": sw.index = True # bind token and set the api sw.setToken(sw.sinaweiboOauth["oauth_token"], sw.sinaweiboOauth["oauth_token_secret"]) # initialize the indexer, if needed if sw.index: lucene.initVM(lucene.CLASSPATH) sw.indexer = sinaweibolucene.IndexSinaWeibo() # dispatch if sw.force_screenname: out = sw.dispatch(opt, fname, output_counts) out = [out] elif id > 0: out = sw.dispatch(opt, id, output_counts) out["id"] = id out = [out] # put in an array for consistency with list of ids else: try: f = open(fname, "r") except IOError: print sw.usage
def lucene_indexing():
    lucene.initVM()
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")
    indexDir = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    analyzer = PorterStemmerAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print("Building lucene index ...")
    with SqliteDict(str(config.WHOLE_WIKI_DB), flag='r', encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor,
                               total=config.TOTAL_ARTICLE_NUMBER_WHOLE):
            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']
            # TODO: change it to extract abstract wiki?
            # get the first paragraph which has length >= 50? so weird.
            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])
            if abs_index == -1:
                # document too short
                valid_page = False
            # only title
            title_term_list = []
            title_poss_list = []
            # only abstract content
            abstract_term_list = []
            abstract_poss_list = []
            assert len(article_clean_text) == len(article_poss)
            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                    if p_i == 0:
                        # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue
                        # If the terms are in the title, we don't include them in the abstract and article terms.
                    else:
                        if p_i == abs_index:
                            # If the terms are in the abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)
            added_title = article_title
            added_text = " ".join(title_term_list + abstract_term_list)
            doc = Document()
            doc.add(Field("title", added_title, StoredField.TYPE))
            doc.add(Field("text", added_text, TextField.TYPE_STORED))
            writer.addDocument(doc)
    writer.close()
print "warning: no content in sentence %d of file %s" % sentence_num, filename writer.addDocument(doc) sentence_num += 1 except Exception, e: #print "Failed in indexDocs:", e error = 1 print "Index has Added " + str(docindex_num) + " files..." if __name__ == '__main__': # 即可以作为主程序运行,也可以作为模块导入 if len(sys.argv) < 5: print IndexFiles.__doc__ sys.exit(1) startDate = datetime.strptime(sys.argv[3], '%Y%m%d') endDate = datetime.strptime(sys.argv[4], '%Y%m%d') lucene.initVM() # 初始化java虚拟机 print 'lucene', lucene.VERSION start = datetime.now() try: IndexFiles(sys.argv[1], sys.argv[2], lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), startDate, endDate) end = datetime.now() print end - start except Exception, e: print "Failed: ", e # os.system("pause")
def indexFile(self):
    self._th = lucene.initVM()
    self._analyzer = StandardAnalyzer(Version.LUCENE_36)
    self._dir = RAMDirectory()
    self._writer = IndexWriter(self._dir, self._analyzer, True,
                               IndexWriter.MaxFieldLength(25000))
def post(self):
    q = self.get_argument("query")
    k = self.get_argument("kTerms")
    # self.write(key)
    # def query(query):
    #     query = self.get_argument("q")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    items = []
    rQ = []
    # for key, value in doc_urls.iteritems():
    #     print (key, value)
    for hit in hits.scoreDocs:
        # items.append({'score': hit.score, 'doc': hit.doc, 'blah': hit.toString(),
        #               'url': doc_urls[str(hit.doc)]})
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        print(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)
        rQ.append("html_files/" + str(hit.doc))
    i = 0
    rqSize = 0
    for url in rQ:
        rqSize = rqSize + 1
        print(url)
        f = codecs.open(url, 'r')
        html = f.read()
        html = html.decode('utf-8')
        tag_free = strip_tags(html)
        path = 'strippedHTML_files'
        if not os.path.exists(path):
            os.makedirs(path)
        filename = str(i)
        with open(os.path.join(path, filename), 'wb') as temp_file:
            temp_file.write(tag_free.encode('utf-8'))
        i = i + 1
    path = 'strippedHTML_files'
    i = 0
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r') as myfile:
            data = myfile.read()
            stripStopWords(data, i)
            i = i + 1
    if k > 0:
        newQuery = calcNewQuery(k, q, rqSize)
        q = newQuery
        print("new query is ")
        print(q)
    self.render("index.html", title="Results", items=items, query=q, kTerms=k)
def GET(self, query):
    data_input = web.input()
    page = 0
    if "page" in data_input:
        page = int(data_input["page"])
    render = web.template.render('templates/')
    anses = []
    num_pages = 0
    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book

        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())
        s = Search(es).index('book-index').doc_type('book').query(
            Q('match', title=query.strip()) |
            Q('match', description=query.strip()) |
            Q("match", userreviews_userReview=query.strip()))
        ## This damn statement took half an hour from me! Nowhere in the documentation
        ## is it indicated that it must come before s.execute()
        s = s[page * 10:page * 10 + 10]
        response = s.execute()
        # print 'total number of hits: ', response.hits.total
        num_pages = (response.hits.total / 10) + 1
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            anses.append({
                'title': res.title,
                'description': res.description.encode('utf-8'),
                'url': res.url,
                'cover': res.cover,
                'authors': authors
            })
    else:
        # importing libraries for Lucene
        import lucene
        from java.io import File
        from org.apache.lucene.index import DirectoryReader, Term
        from org.apache.lucene.queryparser.classic import QueryParser
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        import os

        # fields
        title_field = 'title'
        description_field = 'description'
        cover_field = 'cover'
        authors_name_field = 'authors_name'
        authors_url_field = 'authors_url'
        url_field = 'url'

        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)

        lucene.initVM()
        version = Version.LUCENE_CURRENT
        directory = SimpleFSDirectory(File(index_path))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(version)

        title_tq = TermQuery(Term(title_field, query))
        desc_tq = TermQuery(Term(description_field, query))
        query = BooleanQuery()
        query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
        query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
        scoreDocs = searcher.search(query, 1000).scoreDocs
        num_pages = (len(scoreDocs) / 10) + 1

        for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
            doc = searcher.doc(scoreDoc.doc)
            authors = zip([doc.get(authors_name_field)], [doc.get(authors_url_field)])
            anses.append({
                'title': doc.get(title_field),
                'description': doc.get(description_field).encode('utf-8'),
                'url': doc.get(url_field),
                'cover': doc.get(cover_field),
                'authors': authors
            })
    return render.index(anses, query, num_pages)
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))
    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same')
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True, lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue
        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)
        # Frequently the definition text contains URLs or other hyperlinks that could
        # produce query hits we do not want, errantly increasing the score of the field.
        # We strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition", strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES, lucene.Field.Index.ANALYZED, 0.4)
        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all represented
        # as lists in our Ontology object and need to be entered in one at a time
        add_fields_to_document(doc, "synonym", [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO, lucene.Field.Index.ANALYZED, 0.7)
        add_fields_to_document(doc, "alt_id", term.alternateIds, lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(doc, "xref",
                               [replace_xref_identifier(x, xref_map) for x in term.xrefs],
                               lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(doc, "relationship",
                               [" ".join(list(x)) for x in list(term.relationships)],
                               lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets, lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    lucene.initVM()
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        IndexFiles(sys.argv[1], "index",
                   lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
def ansSearch(command, prior, page, RPP):
    if searcher_ans.vm == None:
        searcher_ans.vm = initVM()
    searcher_ans.vm.attachCurrentThread()
    return searcher_ans.Searchfile(command, prior, page, RPP)
        query1 = ''.join(query)
        command = query1
        os.remove('search.txt')
        if command == '':
            return
        print()
        start = datetime.now()
        print("Searching for:", command)
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))
        end = datetime.now()
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print(doc.get("path"), 'name:', doc.get("name"))
        print('done...')
        print(end - start)


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    run(searcher, analyzer)
    del searcher
if result:                     #DO_NOT_DISTRIBUTE
    exit(result)               #DO_NOT_DISTRIBUTE
sysPath.insert(0, libDir)      #DO_NOT_DISTRIBUTE

from os import getenv
from warnings import warn

maxheap = getenv('PYLUCENE_MAXHEAP')
if not maxheap:
    maxheap = '4g'
    warn("Using '4g' as maxheap for lucene.initVM(). "
         "To override use PYLUCENE_MAXHEAP environment variable.")

from lucene import initVM, getVMEnv
try:
    VM = initVM(maxheap=maxheap)  # , vmargs='-agentlib:hprof=heap=sites')
except ValueError:
    VM = getVMEnv()

from meresco_lucene import initVM
VMM = initVM()

from fieldregistry import SORTED_PREFIX, UNTOKENIZED_PREFIX, KEY_PREFIX, NUMERIC_PREFIX
from _version import version
from luceneresponse import LuceneResponse
from _lucene import Lucene
from lucenesettings import LuceneSettings
from fields2lucenedoc import Fields2LuceneDoc
from cqltolucenequery import CqlToLuceneQuery
from multilucene import MultiLucene
from termnumerator import TermNumerator
def __init__(self, storeDir):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    self.dir = SimpleFSDirectory(File(storeDir))
op.add_option("-i", dest='create_index', default=False, action='store_true', help="create index; not search") op.add_option("--maxheap", dest='maxheap', default='8g', help="min ram for the VM") op.add_option("--max_n", dest='max_n', default=MAX_N, help="max return search item") opts, args = op.parse_args(sys.argv) lucene.initVM(maxheap=opts.maxheap) print('lucene', lucene.VERSION) start = datetime.now() if opts.exact_match: print("creating keyworkanalyzer -> exact match on %s" % DEFAULT_SEARCH_FIELD) analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT) else: print("creating stdanalyzer -> keyword match on %s" % DEFAULT_SEARCH_FIELD) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) fname = os.path.join(base_dir, opts.index_dir)
        if ikeyword == "":
            return render.result_text(ikeyword, [[]], 0)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        results = search_site(ikeyword)
        print len(results[0])
        length = len(results)
        return render.result_text(ikeyword, results, length)


class image:
    def POST(self):
        i = web.input(myfile={})
        f = open('target.jpg', 'w')
        f.write(str(i['myfile'].value))
        f.close()
        ikeyword = 'blank'
        if i.myfile.value == "":
            return render.result_pic(ikeyword, [[]], 0)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        results = search_img(match_imgs("target.jpg"))
        length = len(results)
        return render.result_pic(ikeyword, results, length)


if __name__ == "__main__":
    vm_env = lucene.initVM()
    app = web.application(urls, globals())
    app.run()
def ready(self):
    settings.JVM = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
idfpath = "D:\\ICTCLAS\\wordIDF_MBStrategy.txt" totalfile = 659796 stockcodeflag = 1 if sourcedata == "股票论坛".decode('utf8').encode('gbk'): STORE_DIR = "D:\\DATA\\Index\\text" idfpath = "D:\\ICTCLAS\\wordIDF_text.txt" totalfile = 1487094 stockcodeflag = 1 if sourcedata == "个股新闻".decode('utf8').encode('gbk'): STORE_DIR = "D:\\DATA\\Index\\sinaStockNews" idfpath = "D:\\ICTCLAS\\wordIDF_sinaStockNews.txt" totalfile = 441061 stockcodeflag = 0 #------------- Lucene Init ----------------- initVM(maxheap='512m') print 'lucene', VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(directory, True) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) print "Lucene Search Init Done..." #------------- Divde Word Init ----------------- divide = cdll.LoadLibrary("D:\\ICTCLAS\\ICTCLAS50.dll") IfInit = divide.ICTCLAS_Init(c_char_p("D:\\ICTCLAS")) wordcount = divide.ICTCLAS_ImportUserDictFile('D:\\ICTCLAS\\userdict.txt', "CODE_TYPE_UTF8") print "Divde Word Init Done..." print #------------- Net Word Generate -----------------
def __init__(self, DATA_DIR, vocab, n_threads, max_terms_per_doc, index_name,
             index_name_term, docs_path, docs_path_term, use_cache):
    self.n_threads = n_threads
    # folders to store lucene's indexes; they will be created in case they do not exist
    self.index_folder = DATA_DIR + '/data/' + index_name + '/'
    self.index_folder_term = DATA_DIR + '/data/' + index_name_term + '/'
    self.local_index_folder = './' + index_name
    self.local_index_folder_term = './' + index_name_term
    self.use_cache = use_cache
    self.docs_path = docs_path
    self.docs_path_term = docs_path_term
    self.max_terms_per_doc = max_terms_per_doc
    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = vocab
    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(self.index_folder):
        print 'Creating index at', self.index_folder
        if self.docs_path == self.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(self.index_folder, self.docs_path, add_terms)

    if self.local_index_folder:
        print 'copying index from', self.index_folder, 'to', self.local_index_folder
        if os.path.exists(self.local_index_folder):
            print 'Folder', self.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(self.index_folder, self.local_index_folder)
        self.index_folder = self.local_index_folder
    else:
        self.index_folder = self.index_folder

    fsDir = MMapDirectory(Paths.get(self.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

    if self.docs_path != self.docs_path_term:
        if not os.path.exists(self.index_folder_term):
            print 'Creating index at', self.index_folder_term
            self.create_index(self.index_folder_term, self.docs_path_term, add_terms=True)
        if self.local_index_folder_term:
            print 'copying index from', self.index_folder_term, 'to', self.local_index_folder_term
            if os.path.exists(self.local_index_folder_term):
                print 'Folder', self.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(self.index_folder_term, self.local_index_folder_term)
            self.index_folder_term = self.local_index_folder_term
        else:
            self.index_folder_term = self.index_folder_term

    fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
    self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=self.n_threads)
    self.cache = {}
    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs/'
    # list of addresses of all json files
    all_json_dirs = glob.glob(jsons_root_dir + '*.json')
    # first, read all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'

    # now create a set of all links and then a list of all links in the json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)

    ## if the user has selected to index documents using Elasticsearch.
    ## Note that when using Elasticsearch, PageRank is ignored.
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')
        ## adding all documents to the index 'book-index'
        for idx, js in enumerate(jsons):
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8'),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'
    ### use pyLucene instead
    else:
        import lucene
        from java.io import File
        from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo
        from org.apache.lucene.document import Document, Field, FieldType, IntField, FloatField
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        print 'Using Lucene for indexing'

        ## if the user has selected to calculate the PageRank
        if calculate_PageRank:
            # now create the unnormalized adjacency matrix
            print 'creating the unnormalized adjacency matrix.'
            adjacency = np.zeros((len(links_set), len(links_set)))
            for js in jsons:
                node_idx = links.index(js["url"])
                for l in js["outlinks"]:
                    out_idx = links.index(l)
                    adjacency[node_idx, out_idx] += 1
            print 'the unnormalized adjacency matrix created.'
            print 'normalizing the adjacency matrix with teleporting constant value of ', tele_const
            norm_mat = Normalize(adjacency, tele_const)
            print 'calculating the PageRank scores'
            pr_scores = PageRankScore(norm_mat)

        ## here goes the pyLucene code, which means I should switch to the damn Ubuntu
        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)
        print 'initializing Lucene VM'
        lucene.initVM()
        print 'lucene version ', lucene.VERSION
        version = Version.LUCENE_CURRENT
        index_store = SimpleFSDirectory(File(index_path))
        analyzer = StandardAnalyzer(version)
        config = IndexWriterConfig(version, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, config)

        # Options
        TokenizeFields = True

        # Title field type
        title_field = 'title'
        tft = FieldType()
        tft.setIndexed(True)
        tft.setStored(True)
        tft.setTokenized(TokenizeFields)
        tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)  # only index the document and frequency data

        # Authors name field type
        authors_name_field = 'authors_name'
        anft = FieldType()
        anft.setIndexed(True)
        anft.setStored(True)
        anft.setTokenized(TokenizeFields)
        anft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Authors url field type
        authors_url_field = 'authors_url'
        auft = FieldType()
        auft.setIndexed(False)
        auft.setStored(True)

        # Average rating field type
        average_field = 'average'

        # Cover Image URL field type
        cover_field = 'cover'
        cft = FieldType()
        cft.setIndexed(False)
        cft.setStored(True)

        # Book description field type
        description_field = 'description'
        descft = FieldType()
        descft.setIndexed(True)
        descft.setStored(True)
        descft.setTokenized(TokenizeFields)
        descft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # Outlinks field type
        outlinks_field = "outlinks"
        outft = FieldType()
        outft.setIndexed(False)
        outft.setStored(True)

        # Ratings count field type
        ratings_field = 'ratings'

        # Reviews count field type
        reviews_field = 'reviews'

        # URL field type
        url_field = 'url'
        uft = FieldType()
        uft.setIndexed(False)
        uft.setStored(True)

        # userreviews.userName field type
        userreviews_userName_field = 'userreviews_userName'
        usunft = FieldType()
        usunft.setIndexed(False)
        usunft.setStored(True)

        # userreviews.userReview field type
        userreviews_userReview_field = 'userreviews_userReview'
        usurft = FieldType()
        usurft.setIndexed(True)
        usurft.setStored(False)
        usurft.setTokenized(TokenizeFields)
        usurft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # userreviews.userReviewDate field type
        userreviews_userReviewDate_field = 'userreviews_userReviewDate'
        usudft = FieldType()
        usudft.setIndexed(False)
        usudft.setStored(True)

        # userreviews.userURL field type
        userreviews_userURL_field = 'userreviews_userURL'
        usuuft = FieldType()
        usuuft.setIndexed(False)
        usuuft.setStored(True)

        docid_field = 'docid'

        for idx, js in enumerate(jsons):
            boostVal = js['average']
            if calculate_PageRank:
                boostVal *= pr_scores[links.index(js['url'])]
            doc = Document()
            for author in js['authors']:
                doc.add(Field(authors_name_field, author['name'], anft))
                doc.add(Field(authors_url_field, author['url'], auft))
            doc.add(FloatField(average_field, float(js['average']), Field.Store.YES))
            doc.add(Field(cover_field, js['cover'], cft))
            df = Field(description_field, js['description'], descft)
            df.setBoost(boostVal)
            doc.add(df)
            for u in js['outlinks']:
                doc.add(Field(outlinks_field, u, outft))
            doc.add(IntField(ratings_field, js['ratings'], Field.Store.YES))
            doc.add(IntField(reviews_field, js['reviews'], Field.Store.YES))
            tf = Field(title_field, js['title'], tft)
            tf.setBoost(boostVal)
            doc.add(tf)
            doc.add(Field(url_field, js['url'], uft))
            for rev in js['userreviews']:
                doc.add(Field(userreviews_userName_field, rev['userName'], usunft))
                doc.add(Field(userreviews_userReview_field, rev['userReview'], usurft))
                doc.add(Field(userreviews_userReviewDate_field,
                              rev['userReviewDate'], usurft))
                doc.add(Field(userreviews_userURL_field, rev['userURL'], usuuft))
            doc.add(IntField(docid_field, idx, Field.Store.YES))
            writer.addDocument(doc)
        print 'lucene index created'
        writer.commit()
        writer.close()
        print 'writing lucene indexing finished'
            return
        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')


if __name__ == '__main__':
    STORE_DIR = "image_index_v3"
    initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    searcher.close()