def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
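# Hedged search-side counterpart for the index built above, using the same
# PyLucene 4.10.x classes. The function name, index path, and query string are
# illustrative; for simplicity it searches with a plain StandardAnalyzer rather
# than the stopword-augmented one used at index time.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

def wikipedia_search(storage, query_text, n_hits=10):
    reader = DirectoryReader.open(SimpleFSDirectory(File(storage)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(query_text)
    for hit in searcher.search(query, n_hits).scoreDocs:
        doc = searcher.doc(hit.doc)
        print "%s (doc %s)" % (doc.get("title"), doc.get("num"))
    reader.close()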
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)
        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    print "indexed %s docs" % (i + 1)
    writer.close()
def build_index():
    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
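# Hedged usage sketch for LuceneIndexer above. The header layout and the
# IntField/TextField classes (org.apache.lucene.document, PyLucene 4.10.x) are
# assumptions chosen to match how add_document() calls fieldtype(fname, value, store).
header = [("id", IntField), ("body", TextField)]
indexer = LuceneIndexer("/tmp/example_index")  # hypothetical path
indexer.add_document(["42", "hello lucene"], header, id_=0)
indexer.close()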
def __init__(self, root, storeDir, analyzer):
    # Create the index dir if it does not exist
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    # the SimpleFSDirectory in which the index will be written
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # create an index writer,
    # attaching the index dir and config info to it
    writer = IndexWriter(store, config)
    # call the indexing procedure:
    # index all the files in the directory specified by root,
    # writing the index with writer
    self.indexDocs(root, writer)
    # start a ticker
    ticker = Ticker()
    print 'commit index'
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    # stop the ticker when the indexing procedure completes
    ticker.tick = False
    print 'Done'
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."

    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n

    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def reindex(self):
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()
    self.parent.write({'message': "Reindex successful. Corpus analyzer is now set to %s." % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})
def __init__(self, root, store_dir):
    if not os.path.exists(store_dir):
        os.mkdir(store_dir, 0777)

    # NOTE: Hardcoded the analyzer instead of passing it
    lucene.initVM()
    '''
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    '''
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

    # Set the permissions to 777 for the index directory and the write.lock file
    chmod_indexdir_cmd = "chmod 0777 " + store_dir
    writelock_file = store_dir + "/" + "write.lock"
    chmod_writelock_cmd = "chmod 0777 " + writelock_file

    if os.path.exists(store_dir):
        cicmd = os.popen("sudo -S %s" % (chmod_indexdir_cmd), 'w').write('vagrant')
    if os.path.exists(writelock_file):
        cwcmd = os.popen("sudo -S %s" % (chmod_writelock_cmd), 'w').write('vagrant')

    # setting CREATE will overwrite the existing indexes
    ###config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    writer.close()
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def removeindex(self, data):
    writer = IndexWriter(self.d, self.conf)
    writer.deleteDocuments(lucene.Term("_id", data['record']['_id']))
    writer.optimize()
    writer.close()
def deleteRec(self, pid):
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, config)
    writer.deleteDocuments(Term('uid', pid))
    writer.commit()
    writer.close()
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    return
def updateindex(self, data):
    writer = IndexWriter(self.d, self.conf)
    doc = self.buildDocument(data['fields'], data['record'])
    writer.updateDocument(lucene.Term("_id", data['record']['_id']), doc)
    writer.optimize()
    writer.close()
def index(self, data):
    writer = IndexWriter(self.d, self.conf)
    doc = self.buildDocument(data['fields'], data['record'])
    writer.addDocument(doc)
    writer.commit()
    writer.close()
def rebuildIndex(self, data):
    writer = IndexWriter(self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))
    for record in data['records']:
        doc = self.buildDocument(data['fields'], record)
        writer.addDocument(doc)
    writer.commit()
    writer.close()
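# The four handlers above (index/update/remove/rebuild) share Lucene's writer
# lifecycle: open an IndexWriter on the stored directory and config, mutate,
# then commit (or optimize) and close. A hedged sketch of the same keyed
# upsert idiom outside the class, where directory, config, and a prebuilt
# doc are assumed to exist:
writer = IndexWriter(directory, config)
writer.updateDocument(Term("_id", "42"), doc)  # atomic delete-then-add keyed by _id
writer.commit()
writer.close()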
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
class Indexer(object):

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param output: The output directory of the underlying index
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer. """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
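# Hedged usage sketch for the Indexer above, using the Lucene 4.x
# Field(name, value, FieldType) constructor; field names and values are
# illustrative, not part of the original snippet.
indexer = Indexer(root="index")
doc = Document()
doc.add(Field("id", "doc-1", indexer.field_clean))                    # stored, not tokenized
doc.add(Field("body", "text to be tokenized", indexer.field_dirty))   # tokenized, not stored
indexer.index(doc)
indexer.shutdown()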
def make_index(indexed_data, index_destination, source='directory'):
    # index wiki articles based on ck 12 topics
    #analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_destination)), indexWriterConfig)
    if source == 'directory':
        indexDirectory(indexed_data, writer)
    else:
        indexDictionary(indexed_data, writer)
    writer.close()
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    writer.commit()
    writer.close()
class WikiPageIndex():
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        self.writer = IndexWriter(self.directory, self.config)
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        doc = Document()
        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))
        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()

    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))
            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)
        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
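# Hedged usage sketch for WikiPageIndex above. Note the constructor opens an
# IndexSearcher right away, so the directory must already hold an index; the
# path and query string are illustrative.
index = WikiPageIndex("/tmp/wiki_index")
for doc in index.searchIndex("search library", max_results=10):
    print(doc.get("Title"))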
def import_csv_with_content(self, csv_file, content_field):
    try:
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             self.analyzer, False,
                             IndexWriter.MaxFieldLength.LIMITED)
        changed_rows = addmetadata.add_metadata_and_content_from_csv(
            self.searcher, self.reader, writer, csv_file, content_field, self.args_dir)
        writer.close()
    except UnicodeDecodeError:
        try:
            writer.close()
        except:
            pass
        self.parent.write({'error': 'CSV import failed: file contained non-unicode characters. Please save the file with UTF-8 encoding and try again!'})
        return

    self.parent.write({'message': "CSV import complete: %s rows added." % (changed_rows,)})
def dummyIndex(self):
    """ Create a dummy index - to avoid problems updating it """
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(self.indexDir, config)
    doc = Document()
    doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    return
def __init__(self, indexPath):
    """Instantiate the handler object."""
    self.indexPath = indexPath
    self.analyzer = StopAnalyzer()
    # Make sure the path exists
    if not os.path.exists(self.indexPath):
        os.mkdir(self.indexPath)
    if not os.path.exists(os.path.join(self.indexPath, 'segments.gen')):
        log('Creating new index.')
        writer = IndexWriter(self.indexPath, self.analyzer, 1)
        writer.close()
def lucene_index(texts):
    """Add the given texts to the module-level Lucene index.

    :param texts: iterable of documents to index
    """
    index = set_lucene_index['ind']  # nonlocal variable index
    config = IndexWriterConfig(version, analyzer)
    writer = IndexWriter(index, config)
    for t in texts:
        addDoc(writer, t)
    writer.close()
def buildIndex(self, inputFile):
    analyzer = self.getAnalyzer()
    iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(SimpleFSDirectory(File(self.luceneDir)), iwconf)

    # read through input file and write out to lucene
    counter = 0
    linesReadCounter = 0

    with open(inputFile, 'r') as lines:
        linesRead = 0
        for line in lines:
            try:
                linesRead += 1
                if linesRead % 1000 == 0:
                    print "%d lines read" % linesRead

                cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t")
                concept = concept.strip()
                cui = cui.strip()
                strNorm = self.normalizeCasePunct(concept)
                strSorted = self.sortWords(strNorm)
                strStemmed = self.stemWords(strNorm)
                strStemmedSorted = self.stemWords(strSorted)

                fdoc = Document()
                counter += 1
                fid = counter
                fdoc.add(Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                writer.addDocument(fdoc)
                if fid % 1000 == 0:
                    writer.commit()
            except:
                # the original had a bare string expression here; the message
                # was never printed
                print "Skipping line: %s" % line

    writer.commit()
    writer.close()
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer

    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all([isinstance(d, Document) for d in documents]):
        raise TypeError("documents should be iterable of type Document! Given: %s" % type(documents[0]))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)), writer_config)
    for doc in documents:
        writer.addDocument(doc)
    writer.close()
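# Hedged usage sketch for index() above, reusing the Lucene 3.x classes the
# snippet already relies on; the path and sample texts are illustrative.
docs = []
for text in ["first document", "second document"]:
    d = Document()
    d.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
    docs.append(d)
index(StandardAnalyzer(Version.LUCENE_30), "/tmp/example_index", docs)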
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1000)  # 1048576
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexDocs(root, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def rollback(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    writer.rollback()
    writer.close()
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    # as of now, deletion of documents is supported only based on indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
def xmlrpc_indexDocument(self, instance, id, text):
    """Index a new document."""
    self.xmlrpc_unindexDocument(instance, id)

    # Create a document and add its fields to it.
    doc = Document()
    doc.add(Field('id', id, Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field('text', text, Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(Field('instance', instance, Field.Store.YES, Field.Index.UN_TOKENIZED))

    # Write the document into the index.
    writer = IndexWriter(self.indexPath, self.analyzer, 0)
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    log('Insert: Instance: %s Document: %s' % (instance, id))
    return 1
def __init__(self, destination_directory, analyzer):
    if not os.path.exists(destination_directory):
        os.mkdir(destination_directory)
    store = SimpleFSDirectory(File(destination_directory))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.tweetIndexer(writer)
    ticker = Ticker()
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def __init__(self, root, storeDir, relationFile):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    # store = SimpleFSDirectory(File(storeDir).toPath())
    store = SimpleFSDirectory(Paths.get(storeDir))
    # analyzer = StandardAnalyzer()
    analyzer = WhitespaceAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.read_relation(relationFile)
    self.indexDocs(root, writer)
    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
def indexar():
    directory = SimpleFSDirectory(Paths.get("./lucene/index"))
    analyzer = SpanishAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    doc_names = os.listdir("./documentos")
    indexados = 0
    for dn in doc_names:
        d = open("./documentos/" + dn, "r")
        bs = BeautifulSoup(d, "lxml")
        d.close()

        doc = Document()
        doc.add(Field("id", bs.documento.metadatos.identificador.text, StringField.TYPE_STORED))
        doc.add(Field("titulo", bs.documento.metadatos.titulo.text, StringField.TYPE_STORED))
        doc.add(Field("pdf", bs.documento.metadatos.url_pdf.text, StringField.TYPE_STORED))
        doc.add(Field("texto", bs.documento.texto.text, TextField.TYPE_STORED))
        writer.addDocument(doc)
        indexados += 1

    writer.commit()
    writer.close()
    return render_template("indexados.html", lucene=lucene.VERSION, indexados=indexados)

# @app.route("/slides")
# def slides():
#     return render_template("slides/index.html")
def indexing(datadir):
    indexedDocs = 0
    #index_outdir = str(input("Enter index output dir: "))
    path = Paths.get('indexOut')
    indexOut = SimpleFSDirectory(path)
    analyzer = EnglishAnalyzer()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexOut, config)

    for filename in glob.iglob(datadir + '/*.json*', recursive=True):
        try:
            print("Filename is", filename)
            #pdb.set_trace()
            with open(filename) as f:
                for line in f:
                    tweet = json.loads(line)
                    if tweet['lang'] == 'en':
                        # Create a fresh Document per tweet; the original built
                        # one Document outside the loop, so every added document
                        # accumulated the fields of all previous tweets.
                        doc = Document()
                        doc.add(StringField("id", tweet['id_str'], Field.Store.YES))
                        # doc.add(Field("screen_name", tweet['user.screen_name']))
                        # doc.add(Field("name", tweet['user.name']))
                        #doc.add(Field("location", tweet['user.location']))
                        doc.add(TextField("text", tweet['text'], Field.Store.YES))
                        #doc.add(Field("created_at", DateTools.stringToDate(tweet['created_at']), Field.Store.YES))
                        doc.add(TextField("created_at", tweet['created_at'], Field.Store.YES))
                        # doc.add(IntPoint("followers", tweet['user.followers_count'], Field.Store.YES))
                        # doc.add(IntPoint("friends", tweet['friends_count'], Field.Store.YES))
                        writer.addDocument(doc)
                        writer.commit()
                        indexedDocs += 1
        except:
            continue

    writer.close()
    print("Indexed ", indexedDocs, " documents")
def main(src, dst):
    try:
        start_time = time.time()
        print "Indexing starts..."

        indicesDestination = File(dst)
        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        # An Analyzer must split text such as titles and bodies into words
        # before indexing; it is passed, together with the Directory, to the
        # IndexWriter constructor, and it tokenizes the text to be indexed and
        # removes unneeded words.
        analyzer = KeywordAnalyzer()  # treats the whole text as a single token (effectively the same as not analyzing)
        # map for PerFieldAnalyzerWrapper (a Python dict works here); see
        # http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        a = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # SimpleFSDirectory stores the index files in a directory on the file
        # system (the alternatives are DB- and RAM-backed directories); config
        # carries the analyzer setup the IndexWriter needs.
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(src, writer, counter)
        writer.close()
        print "Done"
        print str(counter)
        print "$$$%s\tseconds" % (time.time() - start_time)

    except CorruptIndexException as e:         # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:     # when another writer is using the index
        e.printStackTrace()
    except IOException as e:                   # when directory can't be read/written
        e.printStackTrace()
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    LUCENE_INDEX_DIR = 'mmapDirectory\\trec_v21_para_uri'
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(512.0)  # experimental setting !!

    # write data to index
    #if not is_index_Exist:
    if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            cmd = 'robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
            os.system(cmd)
        else:
            cmd = 'mkdir %s/code_files' % (LUCENE_INDEX_DIR)
            os.system(cmd)
            cmd = 'cp -f *.py %s/code_files' % (LUCENE_INDEX_DIR)
            os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def index(cls, indexDir, taxoDir, facets_config):
    """Create an index, and add to it sample documents and facets.

    indexDir Directory in which the index should be created.
    taxoDir  Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)

    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
        doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
        # obtain the sample facets for the current document
        facets = categories[docNum]
        author = authors[docNum]
        # ... and use the FacetField class for adding facet fields to
        # the Lucene document (and via FacetsConfig to the taxonomy index)
        doc.add(FacetField("Author", author))
        for f in facets:
            doc.add(FacetField("Categories", f))
        # finally add the document to the index
        iw.addDocument(facets_config.build(taxo, doc))
        nDocsAdded += 1

    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    iw.close()
    taxo.close()

    print "Indexed %d documents with facets." % nDocsAdded
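# Hedged search-side sketch for the faceted index built above: count documents
# per "Author" facet. Class names follow the Lucene 4.x+ facet API the snippet
# uses; indexDir, taxoDir, and facets_config are assumed to be the same objects
# passed to index().
from org.apache.lucene.facet import FacetsCollector
from org.apache.lucene.facet.taxonomy import FastTaxonomyFacetCounts
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyReader
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher, MatchAllDocsQuery

searcher = IndexSearcher(DirectoryReader.open(indexDir))
taxo_reader = DirectoryTaxonomyReader(taxoDir)
fc = FacetsCollector()
FacetsCollector.search(searcher, MatchAllDocsQuery(), 10, fc)
facets = FastTaxonomyFacetCounts(taxo_reader, facets_config, fc)
print facets.getTopChildren(10, "Author")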
def __init__(self, root, storeDir):
    self.root = root
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.store = store
    self.Analyzer = analyzer
    self.success = 0
    self.indexDocs(root, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)

    # t1 = FieldType()
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    #
    # t2 = FieldType()
    # t2.setStored(True)
    # t2.setTokenized(True)
    # t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return

    with open(file_dir + "/train/train.ast.src") as fc:
        codes = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in fc.readlines()
        ]

    for k, code in enumerate(codes):
        doc = Document()
        doc.add(StoredField("id", str(k)))
        doc.add(TextField("code", code, Field.Store.YES))
        writer.addDocument(doc)

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
class Indexer(object):
    """Usage: python IndexFiles <doc_directory>"""

    def __init__(self, index_dir):
        print("lucene:", lucene.VERSION)
        self.index_dir = index_dir
        store = SimpleFSDirectory(Paths.get(self.index_dir))
        analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(store, config)

    def build_index(self, dict_data):
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print("done")
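# Hedged usage sketch for the Indexer above; the JVM startup call and the
# sample dictionary are assumptions for illustration.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexer = Indexer("index")
indexer.build_index({"doc-1": "first document body",
                     "doc-2": "second document body"})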
def main():
    try:
        print "Indexing starts..."
        indicesDestination = File("/Users/Falcon/Desktop/New_Indices/IJA_Indices")

        analyzer = KeywordAnalyzer()
        a = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()
        print "Done"
        print str(counter)

    except CorruptIndexException as e:         # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:     # when another writer is using the index
        e.printStackTrace()
    except IOException as e:                   # when directory can't be read/written
        e.printStackTrace()
def index():
    indexFile = File(luceneDirectory).toPath()
    directory = FSDirectory.open(indexFile)
    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 128479)
    writeConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writeConfig)

    file_number = 2
    while file_number <= 200:
        data = []
        file_name = './parsed/parsed_data' + str(file_number) + '.txt'
        with open(file_name) as f:
            for line in f:
                data.append(json.loads(line))
            f.close()
        for j in data:
            doc = create_doc(j)
            writer.addDocument(doc)
        file_number += 1

    writer.close()
def create_index(index):
    indexDir = SimpleFSDirectory(File(index))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open('f:/nlp/data/questions/combine.txt')
    for line in f:
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)
        doc.add(field)
        writer.addDocument(doc)

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def indexer(documents_file):
    analyzer = StandardAnalyzer()
    # create an in-memory (RAM) directory
    directory = RAMDirectory()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    # indexing the documents
    doc = Document()
    lines = documents_file.readlines()
    length = len(lines)
    for line_number in range(length):
        # indexing document ID: flush the previous document and start a new one
        # (note the very first ".U" record flushes an initially empty document)
        if lines[line_number].startswith(".U"):
            doc_id = lines[line_number + 1].strip()
            writer.addDocument(doc)
            doc = Document()
            doc.add(Field("DocID", doc_id, TextField.TYPE_STORED))
        # indexing document description
        elif lines[line_number].startswith(".W"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
        # indexing document title
        elif lines[line_number].startswith(".T"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
        # indexing document keywords
        elif lines[line_number].startswith(".M"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))

    writer.addDocument(doc)
    writer.close()
    return directory, analyzer
class Indexer():
    """This class provides functions to index articles stored in the database."""

    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of the lucene index.
        mode : string
            The mode used when opening the lucene index. Available values are:
            'create', open a new index, overwriting any existing one;
            'append', open an existing index and append to it;
            'create_or_append', 'append' if `index_dir` exists, else 'create'.
        date_format : string
            We save the datetime field as a string; `date_format` specifies
            how to format datetime into string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)

    def index_one(self, article):
        """Create index for one article in the database."""
        try:
            date_published_str = article['date_published'].strftime(self.date_format)
        except Exception as e:
            logger.warning('Error when formatting date_published %r: %s',
                           article['canonical_url'], e)
            return

        doc = Document()
        doc.add(StoredField('group_id', article['group_id']))
        doc.add(StoredField('article_id', article['article_id']))
        doc.add(StringField('date_published', date_published_str, Field.Store.YES))
        doc.add(SortedDocValuesField('date_published', BytesRef(date_published_str)))
        doc.add(StoredField('date_published', date_published_str))
        doc.add(StringField('domain', article['domain'], Field.Store.YES))
        doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
        doc.add(TextField('canonical_url', article['canonical_url'], Field.Store.YES))
        doc.add(TextField('title', article['title'], Field.Store.YES))
        doc.add(TextField('meta', article['meta'], Field.Store.NO))
        doc.add(TextField('content', article['content'], Field.Store.NO))
        doc.add(StoredField('uq_id_str', article['uq_id_str']))
        self.writer.addDocument(doc)

    def close(self):
        """Close the index writer."""
        self.writer.close()
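# Hedged usage sketch for the Indexer class above; the article dict mirrors the
# keys index_one() reads, with illustrative values.
from datetime import datetime

indexer = Indexer('lucene_index', mode='create_or_append')
indexer.index_one({
    'group_id': 1,
    'article_id': 10,
    'date_published': datetime(2016, 1, 1),
    'canonical_url': 'http://example.com/a',
    'domain': 'example.com',
    'site_type': 'claim',
    'title': 'Example title',
    'meta': '',
    'content': 'Example content',
    'uq_id_str': 'abc123',
})
indexer.close()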
INDEXDIR = SimpleFSDirectory(Paths.get(indexDir))
indexWriter = IndexWriter(INDEXDIR, config)

for root, dirnames, filenames in os.walk(docDir):
    for filename in filenames:
        print filename
        url = filename.replace("()", "/").replace(".txt", "")
        # print url
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(root, filename)
        path = os.path.abspath(os.path.normpath(path))
        with open(path, 'r') as c:
            contents = unicode(c.read(), 'utf-8')
        doc = Document()
        urlField = Field('url', url, TextField.TYPE_STORED)
        doc.add(urlField)
        nameField = Field('name', filename, TextField.TYPE_STORED)
        doc.add(nameField)
        pathField = Field('path', path, TextField.TYPE_STORED)
        doc.add(pathField)
        contentsField = Field('contents', contents, TextField.TYPE_STORED)
        doc.add(contentsField)
        indexWriter.addDocument(doc)

indexWriter.commit()
indexWriter.close()
end = datetime.now()
print 'Time spent building the index:', (end - start)
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to
      be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
      ID, text, Reddit ID, subreddit, meta, time, author, ups, downs,
      authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()
    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts,
                # so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to
                # use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
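# Hypothetical invocation of create_index_from_folder(); both paths are
# illustrative and the folder is expected to contain CSVs with the column
# layout described in the docstring above.
create_index_from_folder("reddit_csvs", "reddit_index")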
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."

    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except:
            #print "Failed to parse doc"
            continue
        doc = Document()
        # print text
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
        if n % 1000 == 0:
            print "Indexed %d files (%d docs in index)" % (n, writer.numDocs())

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer()

# store the index in memory
directory = RAMDirectory()
# # store the index in the File System
# directory = FSDirectory()
config = IndexWriterConfig(analyzer)
iwriter = IndexWriter(directory, config)
doc = Document()
text = "This is the text to be indexed."
doc.add(Field("fieldname", text, TextField.TYPE_STORED))
iwriter.addDocument(doc)
iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)
# parse a simple query that searches for "text"
parser = QueryParser("fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, 1000).scoreDocs
for hit in hits:
    result = isearcher.doc(hit.doc)
    print(result.get("fieldname"))

for txtName in gutenberg_list:
    words = nltk.corpus.gutenberg.words(txtName)
class LuceneSearch():

    def __init__(self):
        self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term

            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

    def get_title_id_map(self):
        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_doc(self, doc_id, title, txt, add_terms):
        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):
        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):
        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)
        return out

    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand

        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
class Lucene(object): # default fieldnames for id and contents FIELDNAME_ID = "id" FIELDNAME_CONTENTS = "contents" # internal fieldtypes # used as Enum, the actual values don't matter FIELDTYPE_ID = "id" FIELDTYPE_ID_TV = "id_tv" FIELDTYPE_TEXT = "text" FIELDTYPE_TEXT_TV = "text_tv" FIELDTYPE_TEXT_TVP = "text_tvp" FIELDTYPE_TEXT_NTV = "text_ntv" FIELDTYPE_TEXT_NTVP = "text_ntvp" def __init__(self, index_dir, max_shingle_size=None): global lucene_vm_init if not lucene_vm_init: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True self.dir = SimpleFSDirectory(File(index_dir)) self.max_shingle_size = max_shingle_size self.analyzer = None self.reader = None self.searcher = None self.writer = None self.ldf = None @staticmethod def get_version(): """Get Lucene version.""" return Version.LUCENE_48 @staticmethod def preprocess(text): """Tokenize and stop the input text.""" ts = StandardTokenizer(Lucene.get_version(), StringReader(text.lower())) ts = StopFilter(Lucene.get_version(), ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET) string_builder = StringBuilder() ts.reset() char_term_attr = ts.addAttribute(CharTermAttribute.class_) while ts.incrementToken(): if string_builder.length() > 0: string_builder.append(" ") string_builder.append(char_term_attr.toString()) return string_builder.toString() def get_analyzer(self): """Get analyzer.""" if self.analyzer is None: std_analyzer = StandardAnalyzer(Lucene.get_version()) if self.max_shingle_size is None: self.analyzer = std_analyzer else: self.analyzer = ShingleAnalyzerWrapper(std_analyzer, self.max_shingle_size) return self.analyzer def open_reader(self): """Open IndexReader.""" if self.reader is None: self.reader = DirectoryReader.open(self.dir) def get_reader(self): return self.reader def close_reader(self): """Close IndexReader.""" if self.reader is not None: self.reader.close() self.reader = None else: raise Exception("There is no open IndexReader to close") def open_searcher(self): """ Open IndexSearcher. Automatically opens an IndexReader too, if it is not already open. There is no close method for the searcher. """ if self.searcher is None: self.open_reader() self.searcher = IndexSearcher(self.reader) def get_searcher(self): """Returns index searcher (opens it if needed).""" self.open_searcher() return self.searcher def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1): """ Set searcher to use LM similarity. :param method: LM similarity ("jm" or "dirichlet") :param smoothing_param: smoothing parameter (lambda or mu) """ if method == "jm": similarity = LMJelinekMercerSimilarity(smoothing_param) elif method == "dirichlet": similarity = LMDirichletSimilarity(smoothing_param) else: raise Exception("Unknown method") if self.searcher is None: raise Exception("Searcher has not been created") self.searcher.setSimilarity(similarity) def open_writer(self): """Open IndexWriter.""" if self.writer is None: config = IndexWriterConfig(Lucene.get_version(), self.get_analyzer()) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(self.dir, config) else: raise Exception("IndexWriter is already open") def close_writer(self): """Close IndexWriter.""" if self.writer is not None: self.writer.close() self.writer = None else: raise Exception("There is no open IndexWriter to close") def add_document(self, contents): """ Adds a Lucene document with the specified contents to the index. See LuceneDocument.create_document() for the explanation of contents. 
""" if self.ldf is None: # create a single LuceneDocument object that will be reused self.ldf = LuceneDocument() self.writer.addDocument(self.ldf.create_document(contents)) def get_lucene_document_id(self, doc_id): """Loads a document from a Lucene index based on its id.""" self.open_searcher() query = TermQuery(Term(self.FIELDNAME_ID, doc_id)) tophit = self.searcher.search(query, 1).scoreDocs if len(tophit) == 1: return tophit[0].doc else: return None def get_document_id(self, lucene_doc_id): """Gets lucene document id and returns the document id.""" self.open_reader() return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID) def print_document(self, lucene_doc_id, term_vect=False): """Prints document contents.""" if lucene_doc_id is None: print "Document is not found in the index." else: doc = self.reader.document(lucene_doc_id) print "Document ID (field '" + self.FIELDNAME_ID + "'): " + doc.get( self.FIELDNAME_ID) # first collect (unique) field names fields = [] for f in doc.getFields(): if f.name() != self.FIELDNAME_ID and f.name() not in fields: fields.append(f.name()) for fname in fields: print fname for fv in doc.getValues( fname): # printing (possibly multiple) field values print "\t" + fv # term vector if term_vect: print "-----" termfreqs = self.get_doc_termfreqs(lucene_doc_id, fname) for term in termfreqs: print term + " : " + str(termfreqs[term]) print "-----" def get_lucene_query(self, query, field=FIELDNAME_CONTENTS): """Creates Lucene query from keyword query.""" query = query.replace("(", "").replace(")", "").replace("!", "") return QueryParser(Lucene.get_version(), field, self.get_analyzer()).parse(query) def analyze_query(self, query, field=FIELDNAME_CONTENTS): """ Analyses the query and returns query terms. :param query: query :param field: field name :return: list of query terms """ qterms = [] # holds a list of analyzed query terms ts = self.get_analyzer().tokenStream(field, query) term = ts.addAttribute(CharTermAttribute.class_) ts.reset() while ts.incrementToken(): qterms.append(term.toString()) ts.end() ts.close() return qterms def get_id_lookup_query(self, id, field=None): """Creates Lucene query for searching by (external) document id.""" if field is None: field = self.FIELDNAME_ID return TermQuery(Term(field, id)) def get_and_query(self, queries): """Creates an AND Boolean query from multiple Lucene queries.""" # empty boolean query with Similarity.coord() disabled bq = BooleanQuery(False) for q in queries: bq.add(q, BooleanClause.Occur.MUST) return bq def get_or_query(self, queries): """Creates an OR Boolean query from multiple Lucene queries.""" # empty boolean query with Similarity.coord() disabled bq = BooleanQuery(False) for q in queries: bq.add(q, BooleanClause.Occur.SHOULD) return bq def get_phrase_query(self, query, field): """Creates phrase query for searching exact phrase.""" phq = PhraseQuery() for t in query.split(): phq.add(Term(field, t)) return phq def get_span_query(self, terms, field, slop, ordered=True): """ Creates near span query :param terms: list of terms :param field: field name :param slop: number of terms between the query terms :param ordered: If true, ordered search; otherwise unordered search :return: lucene span near query """ span_queries = [] for term in terms: span_queries.append(SpanTermQuery(Term(field, term))) span_near_query = SpanNearQuery(span_queries, slop, ordered) return span_near_query def get_doc_phrase_freq(self, phrase, field, slop, ordered): """ Returns collection frequency for a given phrase and field. 
:param phrase: str :param field: field name :param slop: number of terms in between :param ordered: If true, term occurrences should be ordered :return: dictionary {doc: freq, ...} """ # creates span near query span_near_query = self.get_span_query(phrase.split(" "), field, slop=slop, ordered=ordered) # extracts document frequency self.open_searcher() index_reader_context = self.searcher.getTopReaderContext() term_contexts = HashMap() terms = TreeSet() span_near_query.extractTerms(terms) for term in terms: term_contexts.put(term, TermContext.build(index_reader_context, term)) leaves = index_reader_context.leaves() doc_phrase_freq = {} # iterates over all atomic readers for atomic_reader_context in leaves: bits = atomic_reader_context.reader().getLiveDocs() spans = span_near_query.getSpans(atomic_reader_context, bits, term_contexts) while spans.next(): lucene_doc_id = spans.doc() doc_id = atomic_reader_context.reader().document( lucene_doc_id).get(self.FIELDNAME_ID) if doc_id not in doc_phrase_freq: doc_phrase_freq[doc_id] = 1 else: doc_phrase_freq[doc_id] += 1 return doc_phrase_freq def get_id_filter(self): return FieldValueFilter(self.FIELDNAME_ID) def __to_retrieval_results(self, scoredocs, field_id=FIELDNAME_ID): """Converts Lucene scoreDocs results to RetrievalResults format.""" rr = RetrievalResults() if scoredocs is not None: for i in xrange(len(scoredocs)): score = scoredocs[i].score lucene_doc_id = scoredocs[i].doc # internal doc_id doc_id = self.reader.document(lucene_doc_id).get(field_id) rr.append(doc_id, score, lucene_doc_id) return rr def score_query(self, query, field_content=FIELDNAME_CONTENTS, field_id=FIELDNAME_ID, num_docs=100): """Scores a given query and return results as a RetrievalScores object.""" lucene_query = self.get_lucene_query(query, field_content) scoredocs = self.searcher.search(lucene_query, num_docs).scoreDocs return self.__to_retrieval_results(scoredocs, field_id) def num_docs(self): """Returns number of documents in the index.""" self.open_reader() return self.reader.numDocs() def num_fields(self): """Returns number of fields in the index.""" self.open_reader() atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader) return atomic_reader.getFieldInfos().size() def get_fields(self): """Returns name of fields in the index.""" fields = [] self.open_reader() atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader) for fieldInfo in atomic_reader.getFieldInfos().iterator(): fields.append(fieldInfo.name) return fields def get_doc_termvector(self, lucene_doc_id, field): """Outputs the document term vector as a generator.""" terms = self.reader.getTermVector(lucene_doc_id, field) if terms: termenum = terms.iterator(None) for bytesref in BytesRefIterator.cast_(termenum): yield bytesref.utf8ToString(), termenum def get_doc_termfreqs(self, lucene_doc_id, field): """ Returns term frequencies for a given document field. :param lucene_doc_id: Lucene document ID :param field: document field :return dict: with terms """ termfreqs = {} for term, termenum in self.get_doc_termvector(lucene_doc_id, field): termfreqs[term] = int(termenum.totalTermFreq()) return termfreqs def get_doc_termfreqs_all_fields(self, lucene_doc_id): """ Returns term frequency for all fields in the given document. 
:param lucene_doc_id: Lucene document ID :return: dictionary {field: {term: freq, ...}, ...} """ doc_termfreqs = {} vectors = self.reader.getTermVectors(lucene_doc_id) if vectors: for field in vectors.iterator(): doc_termfreqs[field] = {} terms = vectors.terms(field) if terms: termenum = terms.iterator(None) for bytesref in BytesRefIterator.cast_(termenum): doc_termfreqs[field][bytesref.utf8ToString()] = int( termenum.totalTermFreq()) print doc_termfreqs[field] return doc_termfreqs def get_coll_termvector(self, field): """ Returns collection term vector for the given field.""" self.open_reader() fields = MultiFields.getFields(self.reader) if fields is not None: terms = fields.terms(field) if terms: termenum = terms.iterator(None) for bytesref in BytesRefIterator.cast_(termenum): yield bytesref.utf8ToString(), termenum def get_coll_termfreq(self, term, field): """ Returns collection term frequency for the given field. :param term: string :param field: string, document field :return: int """ self.open_reader() return self.reader.totalTermFreq(Term(field, term)) def get_doc_freq(self, term, field): """ Returns document frequency for the given term and field. :param term: string, term :param field: string, document field :return: int """ self.open_reader() return self.reader.docFreq(Term(field, term)) def get_doc_count(self, field): """ Returns number of documents with at least one term for the given field. :param field: string, field name :return: int """ self.open_reader() return self.reader.getDocCount(field) def get_coll_length(self, field): """ Returns length of field in the collection. :param field: string, field name :return: int """ self.open_reader() return self.reader.getSumTotalTermFreq(field) def get_avg_len(self, field): """ Returns average length of a field in the collection. :param field: string, field name """ self.open_reader() n = self.reader.getDocCount( field) # number of documents with at least one term for this field len_all = self.reader.getSumTotalTermFreq(field) if n == 0: return 0 else: return len_all / float(n)
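# Usage sketch (added for illustration, not part of the original source). The
# constructor of the surrounding wrapper class is not shown here, so the class
# name `Lucene`, the index path, and the query below are assumptions.
#
#   lucene_wrapper = Lucene("data/index")
#   res = lucene_wrapper.score_query("neil armstrong", num_docs=10)
#   print lucene_wrapper.num_docs(), "docs in index"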
class LuceneSearch(object):

    def __init__(self, args):
        self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.args = args
        index_folder = os.path.join(DATA_DIR, args.index_folder)
        if not os.path.exists(index_folder):
            self.doc_db = DocDB()
            logger.info(f'Creating index at {index_folder}')
            self.create_index(index_folder)
        fsDir = MMapDirectory(Paths.get(index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(MyTFIDFSimilarity())
        self.analyzer = MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
        self.pool = ThreadPool(processes=args.num_search_workers)

    def add_doc(self, title, text, tokens):
        doc = Document()
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", text, self.t2))
        doc.add(Field("token", tokens, self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 14g
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")

        logger.info("Indexing documents...")
        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)
        logger.info(f"Indexed {self.writer.numDocs()} docs.")

        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def search_multithread(self, qs, ranker_doc_max, searcher):
        self.ranker_doc_max = ranker_doc_max
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
        try:
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:  # self.args.ngram == 1
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(q))
        except Exception as e:
            logger.warning(colored(f'{e}: {q}, use query dummy.', 'yellow'))
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:  # self.args.ngram == 1
                query = QueryParser('text', self.analyzer).parse('dummy')

        doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
        hits = self.curr_searcher.search(query, self.ranker_doc_max)
        for i, hit in enumerate(hits.scoreDocs):
            doc = self.curr_searcher.doc(hit.doc)
            doc_scores.append(hit.score)
            doc_titles.append(doc['title'])
            doc_words.append(doc['token'].split('<&>'))
            doc_texts.append(doc['text'])
        if len(doc_scores) == 0:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {q}.',
                    'yellow'))
        return doc_scores, doc_titles, doc_texts, doc_words

    def search_singlethread(self, qs, ranker_doc_max, curr_searcher):
        out = []
        for q in qs:
            try:
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:  # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse(
                        QueryParser.escape(q))
            except Exception as e:
                logger.warning(colored(f'{e}: {q}, use query dummy.', 'yellow'))
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:  # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse('dummy')

            doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
            hits = curr_searcher.search(query, ranker_doc_max)
            for i, hit in enumerate(hits.scoreDocs):
                doc = curr_searcher.doc(hit.doc)
                doc_scores.append(hit.score)
                doc_titles.append(doc['title'])
                doc_words.append(doc['token'].split('<&>'))
                doc_texts.append(doc['text'])
            if len(doc_scores) == 0:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {q}.',
                        'yellow'))
            out.append((doc_scores, doc_titles, doc_texts, doc_words))
        return out

    def batch_closest_docs(self, qs, ranker_doc_max):
        if self.args.num_search_workers > 1:
            out = self.search_multithread(qs, ranker_doc_max, self.searcher)
        else:
            out = self.search_singlethread(qs, ranker_doc_max, self.searcher)
        return out

    def _parse_query(self, field_name, query):
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
        ts.close()

        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            pq = builder.build()
            booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        final_query = booleanQuery.build()
        return final_query
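# Usage sketch (added for illustration, not part of the original source). The
# argparse values are hypothetical but mirror the attributes the class reads
# above (index_folder, ngram, num_search_workers); DATA_DIR, MySimpleAnalyzer,
# MyTFIDFSimilarity, and DocDB are assumed from the surrounding project.
import argparse

args = argparse.Namespace(index_folder='lucene_index', ngram=1,
                          num_search_workers=4)
engine = LuceneSearch(args)
scores, titles, texts, words = engine.batch_closest_docs(
    ['who invented the telephone'], ranker_doc_max=5)[0]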
class Indexer:
    """
    Defines the indexer, which contains the methods for indexing documents.
    """

    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(
        CharArraySet(Arrays.asList([
            "a", "a's", "able", "about", "above", "according", "accordingly",
            "across", "actually", "after", "afterwards", "again", "against",
            "ain't", "all", "allow", "allows", "almost", "alone", "along",
            "already", "also", "although", "always", "am", "among", "amongst",
            "an", "and", "another", "any", "anybody", "anyhow", "anyone",
            "anything", "anyway", "anyways", "anywhere", "apart", "appear",
            "appreciate", "appropriate", "are", "aren't", "around", "as",
            "aside", "ask", "asking", "associated", "at", "available", "away",
            "awfully", "b", "be", "became", "because", "become", "becomes",
            "becoming", "been", "before", "beforehand", "behind", "being",
            "believe", "below", "beside", "besides", "best", "better",
            "between", "beyond", "both", "brief", "but", "by", "c", "c'mon",
            "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes",
            "certain", "certainly", "changes", "clearly", "co", "com", "come",
            "comes", "concerning", "consequently", "consider", "considering",
            "contain", "containing", "contains", "corresponding", "could",
            "couldn't", "course", "currently", "d", "definitely", "described",
            "despite", "did", "didn't", "different", "do", "does", "doesn't",
            "doing", "don't", "done", "down", "downwards", "during", "e",
            "each", "edu", "eg", "eight", "either", "else", "elsewhere",
            "enough", "entirely", "especially", "et", "etc", "even", "ever",
            "every", "everybody", "everyone", "everything", "everywhere", "ex",
            "exactly", "example", "except", "f", "far", "few", "fifth",
            "first", "five", "followed", "following", "follows", "for",
            "former", "formerly", "forth", "four", "from", "further",
            "furthermore", "g", "get", "gets", "getting", "given", "gives",
            "go", "goes", "going", "gone", "got", "gotten", "greetings", "h",
            "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
            "haven't", "having", "he", "he's", "hello", "help", "hence",
            "her", "here", "here's", "hereafter", "hereby", "herein",
            "hereupon", "hers", "herself", "hi", "him", "himself", "his",
            "hither", "hopefully", "how", "howbeit", "however", "i", "i'd",
            "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate", "in",
            "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates",
            "inner", "insofar", "instead", "into", "inward", "is", "isn't",
            "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k",
            "keep", "keeps", "kept", "know", "knows", "known", "l", "last",
            "lately", "later", "latter", "latterly", "least", "less", "lest",
            "let", "let's", "like", "liked", "likely", "little", "look",
            "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe",
            "me", "mean", "meanwhile", "merely", "might", "more", "moreover",
            "most", "mostly", "much", "must", "my", "myself", "n", "name",
            "namely", "nd", "near", "nearly", "necessary", "need", "needs",
            "neither", "never", "nevertheless", "new", "next", "nine", "no",
            "nobody", "non", "none", "noone", "nor", "normally", "not",
            "nothing", "novel", "now", "nowhere", "o", "obviously", "of",
            "off", "often", "oh", "ok", "okay", "old", "on", "once", "one",
            "ones", "only", "onto", "or", "other", "others", "otherwise",
            "ought", "our", "ours", "ourselves", "out", "outside", "over",
            "overall", "own", "p", "particular", "particularly", "per",
            "perhaps", "placed", "please", "plus", "possible", "presumably",
            "probably", "provides", "q", "que", "quite", "qv", "r", "rather",
            "rd", "re", "really", "reasonably", "regarding", "regardless",
            "regards", "relatively", "respectively", "right", "s", "said",
            "same", "saw", "say", "saying", "says", "second", "secondly",
            "see", "seeing", "seem", "seemed", "seeming", "seems", "seen",
            "self", "selves", "sensible", "sent", "serious", "seriously",
            "seven", "several", "shall", "she", "should", "shouldn't",
            "since", "six", "so", "some", "somebody", "somehow", "someone",
            "something", "sometime", "sometimes", "somewhat", "somewhere",
            "soon", "sorry", "specified", "specify", "specifying", "still",
            "sub", "such", "sup", "sure", "t", "t's", "take", "taken", "tell",
            "tends", "th", "than", "thank", "thanks", "thanx", "that",
            "that's", "thats", "the", "their", "theirs", "them", "themselves",
            "then", "thence", "there", "there's", "thereafter", "thereby",
            "therefore", "therein", "theres", "thereupon", "these", "they",
            "they'd", "they'll", "they're", "they've", "think", "third",
            "this", "thorough", "thoroughly", "those", "though", "three",
            "through", "throughout", "thru", "thus", "to", "together", "too",
            "took", "toward", "towards", "tried", "tries", "truly", "try",
            "trying", "twice", "two", "u", "un", "under", "unfortunately",
            "unless", "unlikely", "until", "unto", "up", "upon", "us", "use",
            "used", "useful", "uses", "using", "usually", "uucp", "v",
            "value", "various", "very", "via", "viz", "vs", "w", "want",
            "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're",
            "we've", "welcome", "well", "went", "were", "weren't", "what",
            "what's", "whatever", "when", "whence", "whenever", "where",
            "where's", "whereafter", "whereas", "whereby", "wherein",
            "whereupon", "wherever", "whether", "which", "while", "whither",
            "who", "who's", "whoever", "whole", "whom", "whose", "why",
            "will", "willing", "wish", "with", "within", "without", "won't",
            "wonder", "would", "wouldn't", "x", "y", "yes", "yet", "you",
            "you'd", "you'll", "you're", "you've", "your", "yours"
        ]), False))

    def __init__(self, index_dir):
        """
        :param index_dir: the dir where to store the index.
        """
        self.indexDir = index_dir
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=self.ENGLISH_STOP_WORDS_SET)
        conf = IndexWriterConfig(self.analyzer)
        conf.setUseCompoundFile(False)
        directory = FSDirectory.open(Paths.get(index_dir))
        self.writer = IndexWriter(directory, conf)

    def index_folder(self, folder2index):
        """
        :param folder2index: the folder to be indexed.
        """
        # Browse all the files from the root folder and store the paths
        files = glob.glob(folder2index + '**/*.xml', recursive=True)
        num_lines = len(files)
        print('\n==> Start processing....\n')
        # Iterate over the list of file paths
        with tqdm(total=num_lines) as pbar:
            for file in files:
                pbar.update(1)
                doc = WikiDocument(file)  # parse the wikipedia page
                self.index_document(doc)  # index the wikipedia page
        print("\n==> Please wait ...\n")
        self.writer.commit()
        print('A total of ' + str(self.writer.getDocStats().numDocs) +
              ' documents have been indexed.')
        self.close()

    def index_document(self, wiki_doc):
        """
        :param wiki_doc: the document to be indexed.
        """
        # Index every section of the article as a separate document
        i = 0
        for section in wiki_doc.sections:
            doc = Document()
            doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
            doc.add(TextField("title_article", wiki_doc.title,
                              Field.Store.YES))
            doc.add(StringField("id_section",
                                str(wiki_doc.id) + "_" + str(i),
                                Field.Store.YES))
            doc.add(TextField("title_section", section.title,
                              Field.Store.YES))
            doc.add(TextField("content_section", section.text,
                              Field.Store.YES))
            self.writer.addDocument(doc)
            i += 1

    def close(self):
        # close the index
        self.writer.close()
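# Usage sketch (added for illustration; the folder names are hypothetical, and
# WikiDocument / MyPythonEnglishAnalyzer are assumed to ship with this snippet).
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexer = Indexer("wiki_index")
indexer.index_folder("wiki_dump/")  # recursively indexes all *.xml files below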
class Indexer(object):
    """
    The Indexer class contains everything that is needed to index files.
    """

    def __init__(self, dest=None):
        """
        Create an Apache Lucene indexer.

        input: dest    destination for the index. If not set, use RAM.
        """
        # where to store the index: file or RAM
        if dest:
            _dir = FSDirectory.open(java.io.File(dest))
        else:
            _dir = RAMDirectory()
        self.directory = _dir

        # analyser
        self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

        # index writer
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
        cfg.setDefaultWriteLockTimeout(6000)
        self.idx_writer = IndexWriter(self.directory, cfg)

    def add_document(self, url, field, text):
        """
        Add a new document to the index writer.

        input: url    the url of the target to be indexed
               field  fieldname of the value that will be indexed
               text   text to be indexed
        """
        doc = Document()
        doc.add(Field('url', url, TextField.TYPE_STORED))
        doc.add(Field(field, text, TextField.TYPE_STORED))
        self.idx_writer.addDocument(doc)

    def close_indexer(self):
        self.idx_writer.close()

    def search(self, field, text):
        """
        Search text within the indexed data.

        input:  field  fieldname of the value that will be searched
                text   text to search

        output: results  a list of (score, url, title) hits
        """
        results = []
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field,
                                      self.analyser)
        query = parser.parse(text)

        # search
        hits = idx_searcher.search(query, 1000).scoreDocs
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))
        return results
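# Usage sketch (added for illustration): index two tiny documents in RAM and
# search them; the URLs and text are hypothetical.
lucene.initVM()
idx = Indexer()  # no dest given -> RAMDirectory
idx.add_document("http://example.org/a", "content", "hello lucene world")
idx.add_document("http://example.org/b", "content", "another small document")
idx.close_indexer()
for score, url, title in idx.search("content", "lucene"):
    print score, url, title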
def main():
    """Index negative situations and retrieve them for an input sentence."""
    all_sent_df = pd.read_csv("../data/sentiment_data.csv")
    neg = all_sent_df[all_sent_df["label"] == 1]
    all_neg_phrases = list(neg["phrase"])
    with open("../data/negSituations.txt", "r") as fpointer:
        all_neg_situations = fpointer.readlines()
    all_neg_situations = map(lambda s: s.strip(), all_neg_situations)
    all_neg_phrases = map(lambda s: s.strip(), all_neg_phrases)

    lucene.initVM()
    analyzer = StandardAnalyzer()
    path = Paths.get('negSituationIndex')
    directory = SimpleFSDirectory(path)
    writer_config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writer_config)
    print(writer.numDocs())

    # INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
    for each in all_neg_situations:
        document = Document()
        document.add(Field("negativeSituations", each, TextField.TYPE_STORED))
        writer.addDocument(document)
    print(writer.numDocs())
    writer.close()

    analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)

    # QUERYING FOR A QUESTION
    with open("../data/negative_situation_to_retrieve.txt", "r") as fpointer:
        all_test_sent = fpointer.readlines()
    all_test_sent = map(lambda s: s.strip(), all_test_sent)
    query_parser = QueryParser("negativeSituations", analyzer)
    total_num = 0
    tic = time.time()
    all_ans = []
    for each in all_test_sent:
        total_num = total_num + 1
        if total_num % 1000 == 0:
            print(total_num, time.time() - tic)
        query = query_parser.parse(query_parser.escape(each))
        hits = searcher.search(query, 3)
        docs_scores = [hit.score for hit in hits.scoreDocs]
        current_ans = []
        if docs_scores != []:
            for hit in hits.scoreDocs:
                doc_t = searcher.doc(hit.doc)
                doc_text = doc_t.get("negativeSituations")
                current_ans.append(doc_text)
        else:
            continue
        current_ans = list(set(current_ans))
        all_ans.append(current_ans)
    print(all_ans)
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index into the search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm, config)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # search
    docDup = set()
    finalDup = {}
    for i in xrange(len(queries)):
        print 'process query %d' % (i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if d['title'] in docDup:
                continue
            docDup.add(title)
            item = (mongoObj.conn_me).find_one({'title': title})
            if item is None:
                continue
            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue
                docID2 = (docs.scoreDocs)[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print 'begin to clean index, there are %d dup records' % (len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']
        print 'process ' + title
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text,
               raw_name, raw_value, abstract)

    # process remaining records
    # global batch, cnt_batch
    # if cnt_batch > 0:
    #     w.addDocuments(batch)
    #     cnt_batch = 0
    #     del batch[:]
    w.close()
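# Note (added for illustration): the delete-then-add sequence above can also be
# written with IndexWriter.updateDocument, which deletes by a key term and adds
# the replacement document in one call. A minimal sketch, assuming `w` is the
# IndexWriter and `make_doc` is a hypothetical helper that rebuilds the Document:
#
#   w.updateDocument(Term('title', title), make_doc(title, name, value))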
class Indexer(object):
    # Creates the index and adds documents to it.
    # indexDir is the directory where the index is created.

    def __init__(self, indexDir):
        f = Paths.get(indexDir)
        self._dir = SimpleFSDirectory(f)
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self._writer = IndexWriter(self._dir, config)

    def close(self):
        self._writer.close()

    def getDoc(self, file):
        try:
            f = open(os.getcwd() + FILE_DIR + '/' + file, "r")
            try:
                c = []
                s = BeautifulSoup(f, 'html.parser')
                text = s.findAll(text=True)
                c = filter(tag_vis, text)
                try:
                    c = ' '.join(c)
                except Exception as e:
                    c = b' '.join(c)
            except Exception as e:
                print(str(e))
                return
            content = TextField("contents", c, Field.Store.YES)
            fileName = str(Paths.get(file)).split('/')[-1]
            fileName = fileName[:fileName.find(".")]
            filename = TextField("filename", fileName, Field.Store.YES)
            path = TextField("filepath",
                             str(os.getcwd() + FILE_DIR + '/' + file),
                             Field.Store.NO)
            doc = Document()
            doc.add(content)
            doc.add(filename)
            doc.add(path)
            return doc
        except Exception as e:
            print(type(Exception).__name__)
            print(str(e))
            return

    def indexFile(self, file):
        doc = self.getDoc(file)  # build the document once, then add it
        if doc is not None:
            self._writer.addDocument(doc)

    # pass in an absolute path when calling this function
    def createIndex(self, path):
        for file in os.listdir(path):
            print(file)
            if os.path.isfile(path + "/" + file):
                self.indexFile(file)
        return self._writer.numDocs()

    def closeWriter(self):
        self._writer.close()
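# Usage sketch (added for illustration; FILE_DIR and tag_vis are assumed to be
# defined next to this snippet, and the index name is hypothetical).
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexer = Indexer("html_index")
print("%d docs indexed" % indexer.createIndex(os.getcwd() + FILE_DIR))
indexer.closeWriter()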
def indexDocs(storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(Paths.get(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    root = "wiki-pages-text/"

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(top=root):
        print(root, dirnames, filenames)
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print("adding " + filename)
            try:
                path = os.path.join(root, filename)
                file = open(path, encoding="utf8")
                i = 0
                # contents = file.read()
                while True:
                    i += 1
                    line = file.readline()
                    if not line:
                        break
                    doc = Document()
                    termName = line.split()[0] + ' ' + line.split()[1]
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("line", str(i), t1))  # field values must be strings
                    doc.add(Field("termName", termName, t2))
                    doc.add(Field("content", line.replace(termName, ''), t2))
                    writer.addDocument(doc)
                file.close()
                # alternative: index the whole file as a single document
                # doc = Document()
                # doc.add(Field("name", filename, t1))
                # doc.add(Field("path", root, t1))
                # if len(contents) > 0:
                #     doc.add(Field("contents", contents, t2))
                # else:
                #     print("warning: no content in " + filename)
                # writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:" + str(e))

    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
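# Usage sketch (added for illustration; assumes the Ticker helper used above
# and a wiki-pages-text/ folder next to the script; the store dir is
# hypothetical).
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexDocs("wiki_index", StandardAnalyzer())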
def lucene_indexing():
    lucene.initVM()
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    indexDir = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    analyzer = PorterStemmerAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    lprint("Building lucene index ...")
    with SqliteDict(str(config.WHOLE_WIKI_DB), flag='r', encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor,
                               total=config.TOTAL_ARTICLE_NUMBER_WHOLE):
            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            # TODO: change it to extract abstract wiki?
            # Take the first paragraph with length >= 50 as the abstract
            # (a somewhat arbitrary heuristic).
            abs_index = get_first_paragraph_index(
                whole_wiki_db[article_title])

            if abs_index == -1:  # document too short
                valid_page = False

            # only title
            title_term_list = []
            title_poss_list = []

            # only abstract content
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # in title
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue
                    # Terms that are in the title are not included again in
                    # the abstract and article terms.
                    else:
                        if p_i == abs_index:  # in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

            added_title = article_title
            added_text = " ".join(title_term_list + abstract_term_list)

            doc = Document()
            doc.add(Field("title", added_title, StoredField.TYPE))
            doc.add(Field("text", added_text, TextField.TYPE_STORED))
            writer.addDocument(doc)

    writer.close()
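# Search sketch (added for illustration): query the title/text index built by
# lucene_indexing(). The query string is hypothetical; the analyzer must match
# the one used at index time.
lucene.getVMEnv() or lucene.initVM()
reader = DirectoryReader.open(
    SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED))))
searcher = IndexSearcher(reader)
query = QueryParser("text", PorterStemmerAnalyzer()).parse("barack obama")
for hit in searcher.search(query, 5).scoreDocs:
    print(searcher.doc(hit.doc).get("title"), hit.score)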
class LuceneSearch():
    """Index and search docs.

    Parameters
    ----------
    index_dir : str
        Index of the documents produced by Lucene
    db_path : str
        File path of the SQLite database containing articles of the
        wikipedia dump (from DrQA).
    num_search_workers : int (optional), default=8
        Workers to use to accelerate searching.
    """

    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:
        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])
        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)

    def _create_index(self, index_dir: str) -> None:
        """Index documents

        Parameters
        ----------
        index_dir : str
            The dir to store the index
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 14g
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))
            self.writer.addDocument(doc)
        logger.info("Indexed %d docs.", self.writer.numDocs())

        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def _search_multithread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        args = [(query, doc_max) for query in queries]
        queries_results = self.pool.starmap(self._search_multithread_part,
                                            args)
        return queries_results

    def _search_multithread_part(
            self, query: str,
            doc_max: int) -> List[Dict[str, Union[float, str]]]:
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
        try:
            query = QueryParser('text', self.analyzer).parse(
                QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')

        query_results = []
        hits = self.searcher.search(query, doc_max)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })
        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))
        return query_results

    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)
            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)
                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })
            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))
            queries_result.append(query_results)
        return queries_result

    def search(self, query: str,
               doc_max: int = 20) -> List[Dict[str, Union[float, str]]]:
        """Search a given query.

        Parameters
        ----------
        query : str
            Anything you want to search
        doc_max : int
            Maximum number of results to return

        Returns
        -------
        List[Dict[str, Union[float, str]]]
            Search results.
        """
        return self.batch_search([query], doc_max=doc_max)[0]

    def batch_search(
            self,
            queries: List[str],
            doc_max: int = 20) -> List[List[Dict[str, Union[float, str]]]]:
        """Search a list of queries.

        Parameters
        ----------
        queries : List[str]
            queries list
        doc_max : int, optional, default=20
            maximum number of docs returned by the search engine.

        Returns
        -------
        List[List[Dict[str, Union[float, str]]]]
            Results returned by the search engine.
        """
        if self.num_search_workers > 1:
            result = self._search_multithread(queries, doc_max)
        else:
            result = self._search_singlethread(queries, doc_max)
        return result

    @staticmethod
    def pprint(search_result: List[Dict[str, Union[float, str]]]) -> None:
        """Print the results returned by the doc searcher.

        Parameters
        ----------
        search_result : List[Dict[str, Union[float, str]]]
            Results returned from the ranker
        """
        headers = ['Rank', 'Title', 'Text', 'Score']
        table = prettytable.PrettyTable(headers)
        for i, result in enumerate(search_result):
            text, title = result['text'], result['title']
            text = text[:100] + ' ...' if len(text) > 100 else text
            title = title[:30] + ' ...' if len(title) > 30 else title
            table.add_row([i, title, text, '%.5g' % result['score']])
        print('Top Results:')
        print(table)
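# Usage sketch (added for illustration; the index and database paths are
# hypothetical).
searcher = LuceneSearch('lucene_index', db_path='docs.db',
                        num_search_workers=4)
results = searcher.search('who invented the telephone', doc_max=5)
LuceneSearch.pprint(results)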
def buildIndex(self, inputFile):
    analyzer = self.getAnalyzer()
    iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(SimpleFSDirectory(File(self.luceneDir)), iwconf)

    # read through the input file and write out to lucene
    counter = 0
    linesReadCounter = 0

    with open(inputFile, 'r') as lines:
        linesRead = 0
        for line in lines:
            try:
                linesRead += 1
                if linesRead % 1000 == 0:
                    print "%d lines read" % linesRead

                cui, concept = line.replace("\",\"", "\t") \
                                   .replace("\"", "").split("\t")
                concept = concept.strip()
                cui = cui.strip()

                strNorm = self.normalizeCasePunct(concept)
                strSorted = self.sortWords(strNorm)
                strStemmed = self.stemWords(strNorm)
                strStemmedSorted = self.stemWords(strSorted)

                fdoc = Document()
                counter += 1
                fid = counter

                fdoc.add(Field("id", unicode(fid),
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("cui", cui,
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str", concept,
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_norm", strNorm,
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_sorted", strSorted,
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_stemmed", strStemmed,
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_stemmedSorted", strStemmedSorted,
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                writer.addDocument(fdoc)
                if fid % 1000 == 0:
                    writer.commit()
            except:
                print "Skipping line: %s" % line

    writer.commit()
    writer.close()
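# Usage sketch (added for illustration): the class that owns buildIndex is not
# shown here, so the name `ConceptIndexer` and the input path are hypothetical.
# The input file is expected to contain "cui","concept" pairs, one per line.
#
#   indexer = ConceptIndexer(luceneDir="umls_index")
#   indexer.buildIndex("concepts.csv")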