Example #1
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
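
Example #1 above (and several examples below, e.g. #3, #21, #26) starts a Ticker thread to print progress dots while commit() runs, but the class itself is never shown. A minimal sketch of such a helper, assuming it only needs the run() loop and tick flag used above:

import sys
import time

class Ticker(object):
    """Prints a dot roughly once per second until tick is set to False."""
    def __init__(self):
        self.tick = True

    def run(self):
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)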
Example #2
    def __init__(self, root, store_dir):

        if not os.path.exists(store_dir):
            os.mkdir(store_dir, 0777)


        # NOTE: Hardcoded the analyzer instead of passing it
        lucene.initVM()
        '''
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        '''
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        store = SimpleFSDirectory(File(store_dir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

        # Set the permissions to 777 for the index directory and the write.lock file
        chmod_indexdir_cmd = "chmod 0777 " + store_dir
        writelock_file = store_dir + "/" + "write.lock"
        chmod_writelock_cmd = "chmod 0777 " + writelock_file

        if os.path.exists(store_dir):
            cicmd=os.popen("sudo -S %s"%(chmod_indexdir_cmd), 'w').write('vagrant')

        if os.path.exists(writelock_file):
            cwcmd=os.popen("sudo -S %s"%(chmod_writelock_cmd), 'w').write('vagrant')

        # setting CREATE will rewrite over the existing indexes.
        ###config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.close()
Example #3
	def __init__(self,root,storeDir,analyzer):
		# Create the index dir if it does not exist 
		if not os.path.exists(storeDir):
			os.mkdir(storeDir)
		# the SimpleFSDirectory which the index will be written in
		store = SimpleFSDirectory(File(storeDir))
		analyzer = LimitTokenCountAnalyzer(analyzer,1048576)
		config = IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		# create an index writer
		# attach the index dir and config info to it
		writer = IndexWriter(store,config)

		# call the indexing procedure
		# indexing all the files in the directory specified by root
		# write the index with writer
		self.indexDocs(root,writer)
		# start a ticker
		ticker = Ticker()
		print 'commit index'
		threading.Thread(target=ticker.run).start()
		writer.commit()
		writer.close()
		# stop the ticker when the indexing procedure completes
		ticker.tick = False
		print 'Done'
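
Example #1 calls indexDocs(post_dir, writer) and examples #2/#3 call an equivalent self.indexDocs(root, writer) method, but the helper is not reproduced on this page. A minimal sketch of such a function, assuming plain-text files and the same classic Field API and imports (os, Document, Field) used above:

def indexDocs(root, writer):
    """Walk `root` and add one document per file (hypothetical sketch)."""
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            with open(path) as fh:
                contents = fh.read()
            doc = Document()
            doc.add(Field("name", filename, Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)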
Example #4
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #5
 def reindex(self):
     writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)), self.corpus.analyzer, False, IndexWriter.MaxFieldLength.LIMITED)
     indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
     writer.optimize()
     writer.close()
     self.parent.write({'message': "Reindex successful. Corpus analyzer is now set to %s." % (self.corpus.analyzer_str,)})
     self.parent.write({'status': "Ready!"})
Example #6
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
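
A hedged usage sketch for the LuceneIndexer above, assuming tab-separated lines on stdin, a header of (field name, field class) pairs such as TextField/IntField, and the same imports (sys, IntField, TextField) as the class; the schema itself is an assumption:

indexer = LuceneIndexer('/tmp/my_index')
header = [('id', IntField), ('title', TextField), ('body', TextField)]
for i, line in enumerate(sys.stdin):
    fields = line.rstrip('\n').split('\t')
    indexer.add_document(fields, header, i)
indexer.close()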
Example #7
 def _getLucene(self, path):
     directory = FSDirectory.open(Paths.get(path))
     config = IndexWriterConfig(None)
     config.setRAMBufferSizeMB(256.0) # faster
     config.setUseCompoundFile(False) # faster, for Lucene 4.4 and later
     writer = IndexWriter(directory, config)
     reader = writer.getReader()
     searcher = IndexSearcher(reader)
     return writer, reader, searcher
Example #8
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
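
config.setIndexSort() above only takes effect if every document carries a doc-values field matching the sort field. A hedged sketch of adding such a document through the writer returned by getLucene(); NUMERIC_STAMP_FIELD comes from the snippet, while the other field names and the extra LongPoint for range queries are assumptions:

writer, reader, searcher = getLucene('/tmp/sorted-index')

doc = Document()
doc.add(StringField("identifier", "record-1", Field.Store.YES))
stamp = 1524000000
doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, stamp))  # consumed by the index sort
doc.add(LongPoint(NUMERIC_STAMP_FIELD, stamp))              # optional: enables range queries
writer.addDocument(doc)
writer.commit()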
Example #9
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
Example #10
def getLucene(path):
    directory = FSDirectory.open(File(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(Version.LATEST, analyzer)
    mergePolicy = config.getMergePolicy()
    sortingMergePolicy = SortingMergePolicy(mergePolicy, Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    config.setMergePolicy(sortingMergePolicy)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #11
def make_index(indexed_data, index_destination, source='directory'):
    #index wiki articles based on ck 12 topics
    #analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_destination)), indexWriterConfig)
    if source == 'directory':
        indexDirectory(indexed_data, writer)
    else:
        indexDictionary(indexed_data, writer)
    writer.close()
Example #12
class Indexer(object):
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param output: The output directory of the underlying index
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
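
A hedged sketch of how the two FieldTypes above might be used when building a document for Indexer.index(); the field names here are assumptions:

indexer = Indexer(root="index")

doc = Document()
doc.add(Field("id", "doc-42", indexer.field_clean))                       # stored, not tokenized
doc.add(Field("body", "full text to be searched", indexer.field_dirty))  # tokenized, not stored
indexer.index(doc)
indexer.shutdown()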
Example #13
    def __init__(self, root, storeDir, analyzer):
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.commit()
        writer.close()
Example #14
 def import_csv_with_content(self, csv_file, content_field):
     try:
         writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)), self.analyzer, False, IndexWriter.MaxFieldLength.LIMITED)
         changed_rows = addmetadata.add_metadata_and_content_from_csv(self.searcher, self.reader, writer, csv_file, content_field, self.args_dir)
         writer.close()
     except UnicodeDecodeError:
         try:
             writer.close()
         except:
             pass
         self.parent.write({'error': 'CSV import failed: file contained non-unicode characters. Please save the file with UTF-8 encoding and try again!'})
         return
     self.parent.write({'message': "CSV import complete: %s rows added." % (changed_rows,)})
Example #15
    def __init__(self, indexPath):
        """Instantiate the handler object."""
        self.indexPath = indexPath
        self.analyzer = StopAnalyzer()
        
        # Make sure the path exists
        if not os.path.exists(self.indexPath):
            os.mkdir(self.indexPath)

        if not os.path.exists(os.path.join(self.indexPath, 'segments.gen')):
            log('Creating new index.')
            writer = IndexWriter(self.indexPath, self.analyzer, 1)
            writer.close()
Example #16
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #17
def lucene_index(texts):
    """Index each text in `texts` into the module-level Lucene index.

    :param texts: iterable of text strings to add
    :return:
    """
    index = set_lucene_index['ind']  # nonlocal variable index
    config = IndexWriterConfig(version, analyzer)
    writer = IndexWriter(index, config)

    for t in texts:
        addDoc(writer, t)
    writer.close()
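
lucene_index() above depends on module-level set_lucene_index/version/analyzer objects and an addDoc() helper that are not shown. A minimal sketch of the missing addDoc(), assuming each t is a plain text string and the classic Field API is in scope:

def addDoc(writer, text):
    """Add a single text document to the index (hypothetical sketch)."""
    doc = Document()
    doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)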
Example #18
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #19
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all([isinstance(d, Document) for d in documents]):
        raise TypeError("documents should be iterable of type Document! Given: %s" % type(documents[0]))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)), writer_config)
    for doc in documents:
        writer.addDocument(doc)
    writer.close()
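
A hedged usage sketch for the index() helper above, using the same Lucene 3.x-era Field API as the snippet; the field names and sample text are assumptions:

analyzer = StandardAnalyzer(Version.LUCENE_30)

doc = Document()
doc.add(Field("title", "Hello Lucene", Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("body", "An example document body.", Field.Store.YES, Field.Index.ANALYZED))

index(analyzer, "/tmp/demo_index", [doc])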
Example #20
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

        self._facetsConfig = settings.fieldRegistry.facetsConfig

        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
Example #21
 def __init__(self, root, storeDir, analyzer): 
     if not os.path.exists(storeDir): 
         os.mkdir(storeDir) 
     store = SimpleFSDirectory(File(storeDir)) 
     analyzer = LimitTokenCountAnalyzer(analyzer, 1000)#1048576 
     config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) 
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) 
     writer = IndexWriter(store, config) 
     self.indexDocs(root, writer) 
     ticker = Ticker() 
     print 'commit index', 
     threading.Thread(target=ticker.run).start() 
     writer.commit() 
     writer.close() 
     ticker.tick = False 
     print 'done'
Example #22
def rollback(collection_name):
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)

	writer.rollback()
	writer.close()
Example #23
	def __init__(self, writerConfig, indexDir):
		
		lucene.initVM()

		self.mIndexDir = SimpleFSDirectory(File(indexDir))
		self.mConfig = writerConfig
		self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
Example #24
 def open_writer(self):
     """Open IndexWriter."""
     if self.writer is None:
         config = IndexWriterConfig(self.get_version(), self.get_analyzer())
         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
         self.writer = IndexWriter(self.dir, config)
     else:
         raise Exception("IndexWriter is already open")
Example #25
 def buildIndex(self, inputFile):
     analyzer = self.getAnalyzer()
     iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
     
     iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter( SimpleFSDirectory( File(self.luceneDir) ), iwconf)
     
     # read through input file and write out to lucene
     counter = 0
     linesReadCounter = 0
     
     with open(inputFile, 'r') as lines:
         linesRead = 0
         
         for line in lines:
             try:
                 linesRead+=1
                 
                 if linesRead % 1000 == 0:
                     print "%d lines read" % linesRead
                     
                 cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t")
                 concept = concept.strip()
                 cui = cui.strip()
                 
                 strNorm = self.normalizeCasePunct(concept)
                 strSorted = self.sortWords(strNorm)
                 strStemmed = self.stemWords(strNorm)
                 strStemmedSorted = self.stemWords(strSorted)
       
                 fdoc = Document()
                 
                 counter +=1
                 fid = counter
                 
                 fdoc.add( Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 writer.addDocument(fdoc)
                 if fid % 1000 == 0:
                     writer.commit()
             except:
                 "Skipping line: %s" % line
                 
     writer.commit()
     writer.close()
Example #26
    def __init__(self, destination_directory, analyzer):

        if not os.path.exists(destination_directory):
            os.mkdir(destination_directory)

        store = SimpleFSDirectory(File(destination_directory))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.tweetIndexer(writer)
        ticker = Ticker()
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #27
 def __init__(self, path_to_save):
     self.path_to_save = path_to_save
     self.num_docs = 0
     lucene.initVM()
     self.indexDir = SimpleFSDirectory(File(self.path_to_save))
     self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
     self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
     self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
     self.writer = IndexWriter(self.indexDir, self.writerConfig)
Example #28
 def __init__(self, store_dir):
     self.store_dir = store_dir
     if not os.path.exists(store_dir):
         os.mkdir(store_dir, 0777)
     self.store = SimpleFSDirectory(Paths.get(store_dir))
     self.analyzer = StandardAnalyzer()
     self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
     self.config = IndexWriterConfig(self.analyzer)
     self.writer = IndexWriter(self.store, self.config)
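
Example #28 uses the newer PyLucene API, where analyzers and IndexWriterConfig no longer take a Version argument. A hedged sketch of companion methods for the same class that add one document and shut the writer down; the field names and the StringField/TextField imports are assumptions:

    def add_text(self, doc_id, text):
        """Hypothetical companion method: index one piece of text."""
        doc = Document()
        doc.add(StringField("id", doc_id, Field.Store.YES))
        doc.add(TextField("contents", text, Field.Store.YES))
        self.writer.addDocument(doc)

    def close(self):
        self.writer.commit()
        self.writer.close()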
Example #29
    def __init__(self, fileRoot, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store    = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config   = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setSimilarity(similarities.BM25Similarity())
        # Available similarities: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer   = IndexWriter(store, config)

        self.indexDocs(fileRoot, writer)
        print 'commit index',
        writer.commit()
        writer.close()
        print 'done'
Example #30
    def createIndexWriter(self, actual_dir, max_field_length=20000000):
        """
            Returns an IndexWriter object created for the actual_dir specified
        """
        ensureDirExists(actual_dir)
        index = SimpleFSDirectory(File(actual_dir))
        analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

        writerConfig=IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
        similarity=FieldAgnosticSimilarity()

        writerConfig.setSimilarity(similarity)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    ##    res= IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
        res= IndexWriter(index, writerConfig)
        res.deleteAll()
        return res
Example #31
sheet1 = wb.sheet_by_index(1)

print('initializing Lucene VM')
lucene.initVM()
print('lucene version ', lucene.VERSION)

index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

index_store = SimpleFSDirectory(index_path)
# analyzer = StandardAnalyzer()
analyzer = PersianAnalyzer()
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(index_store, config)

TokenizeFields = True

# Question field type

qft = FieldType()
# qft.setIndexed(True)  # todo
qft.setStored(True)
qft.setTokenized(TokenizeFields)
qft.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

# Answer field type
aft = FieldType()
# aft.setIndexed(False)  # todo
aft.setStored(True)
Example #32
    doc.add(
        Field("keywords", ' '.join((command, name, synopsis, description)),
              TextField.TYPE_NOT_STORED))
    doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED))

    writer.addDocument(doc)


if __name__ == '__main__':

    if len(sys.argv) != 2:
        print("Usage: python manindex.py <index dir>")

    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(Paths.get(sys.argv[1]))
        analyzer = StandardAnalyzer()
        analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
        config = IndexWriterConfig(analyzer)
        writer = IndexWriter(directory, config)

        manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep)
        for dir in manpath:
            print("Crawling", dir)
            for name in os.listdir(dir):
                path = os.path.join(dir, name)
                if os.path.isdir(path):
                    indexDirectory(path)
        writer.commit()
        writer.close()
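
The __main__ block above calls indexDirectory(path), which is not reproduced here; the Field-adding code at the top of this example presumably lives in a per-file helper. A hedged sketch of the missing crawl, assuming a hypothetical indexManpage(path) that builds and adds the Document shown above through the module-level writer:

def indexDirectory(dir):
    """Recurse into a man-page directory and index every regular file (sketch)."""
    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if os.path.isdir(path):
            indexDirectory(path)
        elif os.path.isfile(path):
            indexManpage(path)  # hypothetical helper that adds one Document via `writer`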
Example #33
# indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
# writer = IndexWriter(SimpleFSDirectory(File("data/index/ck12_books_paragraphs")), indexWriterConfig)
# indexDictionary(docs, writer)
#
# writer.close()

dir_name = 'data/ck12_book/OEBPS'
docs = {}
html_paths = [os.path.join(dir_name, str(i + 1) + '.html') for i in range(124)]
#
for f_name in html_paths:
    docs.update(book_processing.get_h_all_text(open(f_name).read()))

for fname in os.listdir("data/allwiki"):
    content = open(os.path.join("data", "allwiki", fname)).read()
    i = 0
    for doc in content.split("\n"):
        if len(doc.strip()) > 0:
            docs["%s%d" % (fname, i)] = doc
            i += 1

#analyzer = StandardAnalyzer(Version.LUCENE_30)
analyzer = SnowballAnalyzer(Version.LUCENE_30, "English",
                            StandardAnalyzer.STOP_WORDS_SET)
indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
writer = IndexWriter(SimpleFSDirectory(File("data/index/combo4")),
                     indexWriterConfig)
indexDictionary(docs, writer)

writer.close()
Example #34
luceneIndexPath = '/home/tarun/PE/lucene/luceneIndexDirectoryNewCorpus60/'
corpus = '/home/tarun/PE/newCorpus60/'
trainingFilePath = '/home/tarun/PE/Dataset/training_set.tsv'

lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT) 

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))


# INDEX WRITER
writerConfig = IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer) 
writer = IndexWriter(directory, writerConfig)

print writer.numDocs()
# INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
for fileName in os.listdir(corpus):
	#print fileName
	document = Document()
	article = os.path.join(corpus, fileName)
	content = open(article, 'r').read()
	document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(document)
print writer.numDocs()
writer.close()

# INDEX READER
reader = IndexReader.open(directory)
Example #35
 def __init__(self, indexDir):
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     self.analyzer = StandardAnalyzer()
     # analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
     self.config = IndexWriterConfig(self.analyzer)
     self.writer = IndexWriter(self.directory, self.config)
Example #36
class Index(object):
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

        self._facetsConfig = settings.fieldRegistry.facetsConfig

        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())

    def addDocument(self, term, document):
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)

    def suggest(self, query, count, field):
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords])
        return suggestions

    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms

    def fieldnames(self):
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def drilldownFieldnames(self, path=None, limit=50):
        taxoReader = self._indexAndTaxonomy.taxoReader
        parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else taxoReader.getOrdinal(path[0], path[1:])
        childrenIter = taxoReader.getChildren(parentOrdinal)
        names = []
        while True:
            ordinal = childrenIter.next()
            if ordinal == TaxonomyReader.INVALID_ORDINAL:
                break
            names.append(taxoReader.getPath(ordinal).components[-1])
            if len(names) >= limit:
                break
        return names

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()

    def commit(self):
        if not self._settings.readonly:
            self._taxoWriter.commit()
            self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        if not self._multithreaded:
            return FacetsCollector()
        return FacetSuperCollector(self._indexAndTaxonomy.taxoReader, self._facetsConfig, self._ordinalsReader)

    def facetResult(self, facetCollector):
        facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        self._indexAndTaxonomy.close()
        if not self._settings.readonly:
            self._taxoWriter.close()
            self._indexWriter.close()

    def _analyzeToken(self, token):
        result = []
        reader = StringReader(unicode(token))
        stda = self._settings.analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
Example #37
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# indexing directory
indexDir = FSDirectory.open(File("lucene_index.Index"))

# input which will be indexed with Lucene
title1 = "text of title1"
title2 = "title2"
abstract1 = "abstract1 has many words, e.g. hellow world can be the text"
abstract2 = "text of abstract2"

# configure indexing
config = IndexWriterConfig(Version.LUCENE_CURRENT,
                           WhitespaceAnalyzer(Version.LUCENE_CURRENT))
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iw = IndexWriter(indexDir, config)

# count number of documents processed
nDocsAdded = 0

# create first document
doc = Document()
doc.add(TextField("Title", title1, Field.Store.YES))
doc.add(TextField("Abstract", abstract1, Field.Store.YES))
iw.addDocument(doc)
nDocsAdded += 1

# create second document
doc = Document()
doc.add(TextField("Title", title2, Field.Store.YES))
doc.add(TextField("Abstract", abstract2, Field.Store.YES))
Example #38
class LuceneSearch():
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

        if prm.idf_path:
            print 'Loading IDF dictionary...'
            self.idf = pkl.load(open(prm.idf_path))

    def get_title_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_idf(self, txt):
        txt = utils.clean(txt)
        txt = txt.lower()
        df = set()
        for word in wordpunct_tokenize(txt):
            if word not in df:
                df.add(word)
                self.idf[word] += 1.

    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                           prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.

            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab,
                                                   prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):

        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print 'Creating IDF dictionary...'
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)

                        if doc_id % 1000 == 0:
                            print 'Creating IDF, doc', doc_id
                        doc_id += 1

                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup
                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(
                                    record.payload[:1000 * 1000],
                                    "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])

                                self.add_idf(txt)

                                if doc_id % 1000 == 0:
                                    print 'Creating IDF, doc', doc_id
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)

                pkl.dump(self.idf, open(prm.idf_path, 'wb'))

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print 'indexing doc', doc_id
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # remove html tags
                        #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])

                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print 'indexing doc', doc_id
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND',
                              '\\AND').replace('OR',
                                               '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR',
                                                          '\\OR').replace(
                                                              'NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self,
                       qs,
                       max_cand,
                       max_full_cand=None,
                       save_cache=False,
                       extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(
                        outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
Example #39
 def testDelete(self, fieldName, searchString):
     config = IndexWriterConfig(Version.LUCENE_CURRENT, self.Analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.store, config)
     writer.deleteDocuments(Term(fieldName, searchString))
     writer.close()
Example #40
class LuceneHelper:
    def __init__(self, index_dir):
        self.index_dir = index_dir
        self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
        self.q_parser = QueryParser("", WhitespaceAnalyzer())
        self.commit_max = 500000
        self.__get_writer_searcher()

    def __get_writer_searcher(self):
        writerConfig = IndexWriterConfig()
        print(f"Codec : {writerConfig.getCodec()}")
        self.writer = IndexWriter(self.indexDir, writerConfig)

        self.reader = DirectoryReader.open(self.writer)
        self.searcher = IndexSearcher(self.reader)

    def __query(self, query_str, _max=10):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        hits = self.searcher.search(query_cmd, _max)
        print(
            f"Found {hits.totalHits} document(s) that matched query :'{query_cmd}'"
        )
        return hits

    def __count_docs(self, query_str):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        total = self.searcher.count(query_cmd)
        print(f"Found {total} document(s) that matched query :'{query_cmd}'")
        return total

    def refresh_searcher(self):
        self.reader.close()
        self.reader = DirectoryReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)

    def index_stats(self):
        query_str = f"*:*"
        total_docs = self.__count_docs(query_str)
        if total_docs:
            print(f"There is at least total [{total_docs}] docs.")
        else:
            print("There is no index right now.")

    def delete_old_ttl(self):
        now_time = int(time.time())
        # check how many docs expired
        ttl_query = LongPoint.newRangeQuery("ttl", 0, now_time - 1)
        total_docs = self.searcher.count(ttl_query)
        print(f"At least found {total_docs} document(s) are expired.")
        # delete expired docs
        self.writer.deleteDocuments(ttl_query)
        self.writer.commit()

    def add_doc(self, item_data):
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))

        doc = Document()
        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)

    def commit(self):
        self.writer.commit()

    def close(self):
        self.writer.commit()
        self.reader.close()
        self.writer.close()
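
A hedged usage sketch for LuceneHelper above, assuming item payloads shaped the way add_doc() reads them and that time/json are imported as in the class:

helper = LuceneHelper('/tmp/reco_index')

helper.add_doc({
    'item_id': 'sku-001',
    'ttl': int(time.time()) + 3600,          # expires in one hour
    'version': 'v1',
    'view_similar': {'sku-002': 0.8},
    'view_prospective': {},
})
helper.commit()
helper.refresh_searcher()
helper.index_stats()
helper.delete_old_ttl()
helper.close()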
Example #41
        output_path = sys.argv[2]

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    analyzer_ws = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
    std_path = "%s/lucene_full_standard/" % (output_path)
    ws_path = "%s/lucene_full_ws/" % (output_path)
    if os.path.exists(std_path):
        shutil.rmtree(std_path)  # remove the whole index directory (assumes `import shutil`)
    if os.path.exists(ws_path):
        shutil.rmtree(ws_path)
    indexDir1 = SimpleFSDirectory(File(std_path))
    indexDir2 = SimpleFSDirectory(File(ws_path))
    writerConfig1 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writerConfig2 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer_ws)
    writer1 = IndexWriter(indexDir1, writerConfig1)
    writer2 = IndexWriter(indexDir2, writerConfig2)

    print "%d docs in index1" % writer1.numDocs()
    print "%d docs in index2" % writer2.numDocs()
    print "Reading lines from sys.stdin..."

    ftypes = open(LUCENE_TYPES_FILE, "w")

    for n, l in enumerate(sys.stdin):
        doc = Document()
        doc_lc = Document()
        fields = l.rstrip().split("\t")
        all_ = []
        if n == 0:
            sys.stdout.write("TYPES_HEADER")
Example #42
class Index:
    def __init__(self, folder=None, fields=[], similarity="tfidf"):

        self.jcc = lucene.initVM()

        if folder:
            self.directory = SimpleFSDirectory(File(folder))
        else:
            self.directory = RAMDirectory()

        self.fields = {}

        for field in fields:
            ft = FieldType()
            for pname, pvalue in field.props.items():
                setter = getattr(ft, "set" + pname.capitalize())
                setter(pvalue)

            ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
            # 			ft.setOmitNorms(True)

            self.fields[field.name] = ft

        self.similarity = similarity.lower()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.writer = None
        self.searcher = None

    def attach_thread(self):
        self.jcc.attachCurrentThread()

    def open_writer(self):

        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.writer = IndexWriter(self.directory, config)

    def add(self, **doc):

        if not self.writer:
            self.open_writer()

        d = Document()
        for field, value in doc.items():
            #			try :
            d.add(Field(field, value, self.fields[field]))


#			except Exception, e :
#				print
#				print "Fudeu"
#				pass

        self.writer.addDocument(d)

    def commit(self):
        self.writer.commit()

    def close(self):
        if self.writer:
            self.writer.close()

    def open_searcher(self):
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        if (self.similarity == "bm25"):
            self.searcher.setSimilarity(BM25Similarity())

    def preprocess_query(self, query, fields, mode="ANY"):
        '''
		Fix query according to provided mode. If the value is not supported, 
		the query remains unchanged
		'''

        terms = query.lower().strip().split()
        if mode == "ANY":
            query = " OR ".join(terms)
        elif mode == "ALL":
            query = " AND ".join(terms)
        else:
            print "Invalid mode parameter '%s'." % mode

        query = QueryParser.escape(query)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)
        return query

    def search(self,
               query,
               search_fields,
               return_fields,
               filter=None,
               ignore=set(),
               mode="ANY",
               return_scores=False,
               limit=1000000):
        '''
		Search documents in the index using a standard analyzer (tokenizes and 
		removes stop words). Supports two search modes: ANY and ALL
		  ANY: include documents that contain at least one term of the query.
		  ALL: include only documents that contain all terms of the query. 
		'''

        if not self.searcher:
            self.open_searcher()

        # Return empty results if query is empty (Lucene can't handle it nicely)
        if query.strip() == '':
            if return_scores:
                return [], []
            else:
                return []

        query = self.preprocess_query(query, search_fields, mode)

        # If limit is not provided, return all matched documents. A little hack would
        # be needed for that: query for one document and read the total hit count.
        #   if not limit :
        #       hits = self.searcher.search(query, 1)
        #       limit = hits.totalHits

        # Fetch more than asked for, since entries from the ignore set will be dropped
        fetch_limit = limit
        if limit is not None:
            fetch_limit = limit + len(ignore)

        hits = self.searcher.search(query, filter, fetch_limit)
        hits = hits.scoreDocs

        docs = []
        scores = []
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            if doc['id'] not in ignore:
                docs.append([doc[f] for f in return_fields])
                scores.append(hit.score)

        if return_scores:
            return docs[:limit], scores[:limit]

        return docs[:limit]
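        # note: each docs entry is the list of return_fields values for one hit;
        # with return_scores=True a parallel list of Lucene scores is returned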

    def explain(self, query, fields, doc):

        if not self.searcher:
            self.open_searcher()

        query = QueryParser.escape(query)

        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)

        return self.searcher.explain(query, doc)

    def get_documents(self, doc_ids, fields):

        docs = []
        for doc_id in doc_ids:
            doc = self.reader.document(doc_id)
            if isinstance(fields, basestring):
                docs.append(doc.get(fields))
            else:
                docs.append({f: doc.get(f) for f in fields})

        return docs

    def get_query_scores(self, query, fields, doc_ids, mode="ANY"):

        # Creates pre-filter to ignore all other documents
        filter = TermsFilter([Term("id", id) for id in doc_ids])

        query = self.preprocess_query(query, fields, mode)
        hits = self.searcher.search(query, filter, len(doc_ids)).scoreDocs

        # Creates scores' mapping using entity id instead of internal index id
        scores = {
            str(self.reader.document(hit.doc).get("id")): hit.score
            for hit in hits
        }

        # Normalize to 0..1 interval
        #		n = 1.0/sum(scores.values())
        #		scores

        # Adds to the mapping entries for the non-returned docs (no term found)
        for doc_id in doc_ids:
            if doc_id not in scores:
                scores[doc_id] = 0.0

        return scores
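
    # typical call sequence for this wrapper: attach_thread() in worker threads,
    # add(**doc) per document, commit(), close(); for retrieval use search(...),
    # explain(...), get_documents(...) or get_query_scores(...)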
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command:  python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()

    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))

    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')

    system_flag = platform.system()
    cwd = os.getcwd()

    # initialize mongo client
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)

    db = client.wiki2015
    wiki_article_categories = db['article_categories']

    category_corpus = {}

    pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % (
        NUMBER_TOP_CATEGORY)
    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3',
                                            'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object(
            '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)

    cnt = 0
    if os.path.exists(pkl_filename):
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:

        for item in wiki_article_categories.find():
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)

            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(
                title, 'title', False)
            if doc_entity is None:
                continue

            for f in [
                    'names', 'attributes', 'categories', 'similar_entities',
                    'related_entities', 'catchall'
            ]:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' +
                                                                 f]

            if len(entity_content_dict['catchall'].strip()) == 0:
                continue

            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                if ('<http://dbpedia.org/resource/Category:' + cat +
                        '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)

            #cnt+=1
            #if cnt>20:
            #break

        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)
    client.close()

    # begin writing the data into the index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + str(
            NUMBER_TOP_CATEGORY) + '_fsdm3'
    else:
        LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % (
            cwd) + str(NUMBER_TOP_CATEGORY) + '_fsdm3'

    # backup code files
    cmd = 'robocopy %s %s\\code_files *.py' % (
        r'%cd%', LUCENE_INDEX_DIR
    ) if system_flag == 'Windows' else 'cp *.py %s/code_files' % (
        LUCENE_INDEX_DIR)
    os.system(cmd)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure the analyzer and writer config
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)

    # write data to index
    w = IndexWriter(index_mm, config)

    cnt = 0
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer,
                                              True), 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')

        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]

        for f in [
                'names', 'attributes', 'categories', 'similar_entities',
                'related_entities', 'catchall'
        ]:
            contents = cleanSentence(
                ' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')
        #print ('--------------------')
        # need to calculate corpus average length
        addDoc(w, data)

        #cnt+=1
        #if cnt>20:
        #break

    w.close()
    print('max article num=%d' % (max_article_num))
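
    # fields written per category document: 'category' (StringField), 'label',
    # 'stemmed_label', 'num_articles', plus the six content fields and their
    # 'stemmed_' variants built from the aggregated entity documents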
예제 #44
0
    def create_index(self, index_folder, docs_path, add_terms=False):

        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
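        # t1: stored, indexed with document ids only (id-like fields)
        # t2: not stored, indexed with term frequencies (searchable text)
        # t3: stored only, not indexed (raw payload)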

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print 'Creating IDF dictionary...'
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)

                        if doc_id % 1000 == 0:
                            print 'Creating IDF, doc', doc_id
                        doc_id += 1

                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup
                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(
                                    record.payload[:1000 * 1000],
                                    "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])

                                self.add_idf(txt)

                                if doc_id % 1000 == 0:
                                    print 'Creating IDF, doc', doc_id
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)
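                # at this point idf[t] = ln(N / df(t)), where N is the number of
                # documents processed and df(t) the count accumulated by add_idf()
                # (assuming add_idf() counts each term at most once per document)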

                pkl.dump(self.idf, open(prm.idf_path, 'wb'))

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print 'indexing doc', doc_id
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # remove html tags
                        #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])

                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print 'indexing doc', doc_id
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
class Lucene(object):

    # default fieldnames for id and contents
    FIELDNAME_ID = "id"
    FIELDNAME_CONTENTS = "contents"

    # internal fieldtypes
    # used as Enum, the actual values don't matter
    FIELDTYPE_ID = "id"
    FIELDTYPE_ID_TV = "id_tv"
    FIELDTYPE_TEXT = "text"
    FIELDTYPE_TEXT_TV = "text_tv"
    FIELDTYPE_TEXT_TVP = "text_tvp"

    def __init__(self, index_dir, use_ram=False, jvm_ram=None):
        global lucene_vm_init
        if not lucene_vm_init:
            if jvm_ram:
                # e.g. jvm_ram = "8g"
                print "Increased JVM ram"
                lucene.initVM(vmargs=['-Djava.awt.headless=true'], maxheap=jvm_ram)
            else:
                lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            lucene_vm_init = True
        self.dir = SimpleFSDirectory(Paths.get(index_dir))

        self.use_ram = use_ram
        if use_ram:
            print "Using ram directory..."
            self.ram_dir = RAMDirectory(self.dir, IOContext.DEFAULT)
        self.analyzer = None
        self.reader = None
        self.searcher = None
        self.writer = None
        self.ldf = None
        print "Connected to index " + index_dir

    def get_version(self):
        """Get Lucene version."""
        #return Version.LUCENE_48
        return lucene.VERSION

    def get_analyzer(self):
        """Get analyzer."""
        if self.analyzer is None:
            self.analyzer = StandardAnalyzer()
            #self.analyzer = SmartChineseAnalyzer()
        return self.analyzer

    def open_reader(self):
        """Open IndexReader."""
        if self.reader is None:
            if self.use_ram:
                print "reading from ram directory(%s) ..."%self.ram_dir
                self.reader = DirectoryReader.open(self.ram_dir)
            else:
                self.reader = DirectoryReader.open(self.dir)

    def get_reader(self):
        return self.reader

    def close_reader(self):
        """Close IndexReader."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        else:
            raise Exception("There is no open IndexReader to close")

    def open_searcher(self):
        """
        Open IndexSearcher. Automatically opens an IndexReader too,
        if it is not already open. There is no close method for the
        searcher.
        """
        if self.searcher is None:
            self.open_reader()
            self.searcher = IndexSearcher(self.reader)

    def get_searcher(self):
        """Returns index searcher (opens it if needed)."""
        self.open_searcher()
        return self.searcher

    def open_writer(self):
        """Open IndexWriter."""
        if self.writer is None:
            config = IndexWriterConfig(self.get_analyzer())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            self.writer = IndexWriter(self.dir, config)
        else:
            raise Exception("IndexWriter is already open")

    def close_writer(self):
        """Close IndexWriter."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        else:
            raise Exception("There is no open IndexWriter to close")

    def add_document(self, contents):
        """
        Adds a Lucene document with the specified contents to the index.
        See LuceneDocument.create_document() for the explanation of contents.
        """
        if self.ldf is None:  # create a single LuceneDocument object that will be reused
            self.ldf = LuceneDocument()
        self.writer.addDocument(self.ldf.create_document(contents))

    def get_lucene_document_id(self, doc_id):
        """Loads a document from a Lucene index based on its id."""
        self.open_searcher()
        query = TermQuery(Term(self.FIELDNAME_ID, doc_id))
        tophit = self.searcher.search(query, 1).scoreDocs
        if len(tophit) == 1:
            return tophit[0].doc
        else:
            return None

    def get_document_id(self, lucene_doc_id):
        """Gets lucene document id and returns the document id."""
        self.open_reader()
        return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID)

    def get_id_lookup_query(self, id, field=None):
        """Creates Lucene query for searching by (external) document id """
        if field is None:
            field = self.FIELDNAME_ID
        return TermQuery(Term(field, id))

    def get_and_query(self, queries):
        """Creates an AND Boolean query from multiple Lucene queries """
        # build a BooleanQuery in which every clause must match (MUST occur)
        bq_builder = BooleanQuery.Builder()
        for q in queries:
            bq_builder.add(q, BooleanClause.Occur.MUST)
        bq = bq_builder.build()
        return bq

    def get_or_query(self, queries):
        """Creates an OR Boolean query from multiple Lucene queries """
        # build a BooleanQuery in which any clause may match (SHOULD occur)
        bq_builder = BooleanQuery.Builder()
        for q in queries:
            bq_builder.add(q, BooleanClause.Occur.SHOULD)
        bq = bq_builder.build()
        return bq

    def get_phrase_query(self, query, field):
        """Creates phrase query for searching exact phrase."""
        terms = []
        trans_query = ''
        #process Chinese query
        for c in query:
            if ord(c) >=256:
                trans_query += '%s '%c
            else:
                trans_query += c
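        # e.g. u'北京大学' becomes u'北 京 大 学', so each CJK character is treated
        # as its own phrase term by the whitespace split below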
        for t in trans_query.split():
            terms.append(t)
        phq = PhraseQuery(field, terms)
        return phq

    def num_docs(self):
        """Returns number of documents in the index."""
        self.open_reader()
        return self.reader.numDocs()
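
# A minimal usage sketch for the Lucene wrapper above (the index path, id value
# and document contents are placeholders, not from the original code):
#
#   idx = Lucene("data/index")
#   idx.open_writer()
#   idx.add_document([...])   # contents format defined by LuceneDocument
#   idx.close_writer()
#   print idx.num_docs()
#   lucene_id = idx.get_lucene_document_id("some_external_id")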
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator, Version
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = RAMDirectory()
iconfig = IndexWriterConfig(
    Version.LUCENE_CURRENT,
    LimitTokenCountAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT), 100))
iwriter = IndexWriter(directory, iconfig)

ft = FieldType()
ft.setIndexed(True)
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)

ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed"
]
for t in ts:
    doc = Document()
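    # hedged completion of the truncated snippet: the field name "fieldname" is
    # an assumption, not taken from the original example
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()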
예제 #47
0
    def open_writer(self):

        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.writer = IndexWriter(self.directory, config)
예제 #48
0
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command:  python build_index_wikipedia.py FILENAME')
        quit()

    # create file object
    filename = sys.argv[1]
    print('processing ' + filename)

    cnt = 0
    stemmer = SnowballStemmer('english')

    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    LUCENE_INDEX_DIR = 'mmapDirectory\\index_wikipedia_2015'
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure the analyzer and writer config
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!

    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        cmd = 'robocopy %s %s\\code_files *.py' % (
            r'%cd%', LUCENE_INDEX_DIR
        ) if system_flag == 'Windows' else 'cp *.py %s/code_files' % (
            LUCENE_INDEX_DIR)
        os.system(cmd)

    # the writer is needed below regardless of whether the index already exists
    w = IndexWriter(index_mm, config)

    data = {}
    with open(filename, 'r', encoding='utf-8') as src:
        for page_pair in extract_pages(src):
            label, content, page_id = page_pair[0], page_pair[1], page_pair[2]

            pair_tokens = process_article((content, False, label, page_id))
            content = remove_stopwords(' '.join(pair_tokens[0]), ' ')

            if len(content.split()) < 10:
                continue

            stemmed_content = stemSentence(content, stemmer, False)

            if DEBUG_MODE == True:
                try:
                    print('%s\n%s\n%s\n%s' %
                          (label, page_id, content, stemmed_content))
                except:
                    print('encoding error')

            data.clear()
            data['label'] = (label, 'StringField')
            data['label_lower'] = (label.lower(), 'StringField')
            data['label_lower_text'] = (label.lower(), 'TextField')
            data['wiki_id'] = (page_id, 'StringField')
            #data['content']=(content,'TextField')
            data['stemmed_content'] = (stemmed_content, 'TextField')
            addDoc(w, data)

            cnt += 1
            #if cnt>20:
            #break
            if cnt % 5000 == 0:
                print('finish %d' % (cnt))

    w.close()
예제 #49
0
def create_document(file_name):
    path = './alldocs/' + file_name
    doc = Document()
    with open(path) as f:
        doc.add(StringField("title", file_name, Field.Store.YES))
        doc.add(TextField("text", f.read(), Field.Store.YES))
    return doc


# Initialize lucene and the JVM
lucene.initVM()

directory = RAMDirectory()
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

print "Number of indexed documents: %d\n" % writer.numDocs()
for input_file in listdir(INPUT_DIR):
    print "Current file:", input_file
    doc = create_document(input_file)
    writer.addDocument(doc)

print "\nNumber of indexed documents: %d" % writer.numDocs()
writer.close()
print "Indexing done!\n"
print "------------------------------------------------------"