Example #1
    def __init__(self, root, storeDir_good, storeDir_bad, analyzer):

        if not os.path.exists(storeDir_good):
            os.mkdir(storeDir_good)
        if not os.path.exists(storeDir_bad):
            os.mkdir(storeDir_bad)

        store_good = SimpleFSDirectory(File(storeDir_good))
        store_bad = SimpleFSDirectory(File(storeDir_bad))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        config1 = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config1.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer_good = IndexWriter(store_good, config)
        writer_bad = IndexWriter(store_bad, config1)

        self.indexDocs(root, writer_good, writer_bad)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer_good.commit()
        writer_good.close()
        writer_bad.commit()
        writer_bad.close()
        ticker.tick = False
        print 'done'
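
Several of these examples start a background Ticker thread so the console shows progress while commit() runs, but the class itself is never shown. A minimal sketch consistent with its usage here, modeled on the PyLucene IndexFiles.py sample rather than taken from any of these projects:

import sys
import time

class Ticker(object):
    """Prints a dot every second until .tick is set to False."""
    def __init__(self):
        self.tick = True

    def run(self):
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)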
Example #2
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
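
A minimal search counterpart for the index built above. This sketch is written against a recent PyLucene where the Version argument is no longer required (on the 4.10 build used above, QueryParser and EnglishAnalyzer also take a leading Version argument); the query field and result limit are illustrative:

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

def search(indexdir, querystring, limit=10):
    lucene.initVM()
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(indexdir)))
    searcher = IndexSearcher(reader)
    # query the "abstract" field with the same analyzer used at index time
    query = QueryParser("abstract", EnglishAnalyzer()).parse(querystring)
    for scoreDoc in searcher.search(query, limit).scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print(doc.get("id"), doc.get("title"))
    reader.close()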
Example #3
    def __init__(self, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.folders = {
            'parsed_ctrip':
            ['source', 'location', 'introduction', 'score', 'img_list'],
            'parsed_qunar':
            ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
            'eic_mfw': ['location', 'introduction', 'img_list']
        }
        self.special_tags = ['introduction']
        self.files = self.__getAllPlaces()
        #self.readers = self.__constructReaders()

        self.indexDocs(writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #4
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if lucene_vm_flag == False:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            (self.searchers[0]).setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used == True:
            separator_char = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(separator_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
Example #5
    def createIndexNoStopwords(texts, route, rebuild):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        for key in texts:
            doc = Document()
            doc.add(
                Field("docName", "doc", Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)

        iwriter.close()
Example #6
	def __init__(self, indexDir):
		f = Paths.get(indexDir)
		self._dir = SimpleFSDirectory(f)
		analyzer = StandardAnalyzer()
		config = IndexWriterConfig(analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		self._writer = IndexWriter(self._dir, config)
Example #7
def main():
	try:
		indicesDestination = File(dest_path)
		analyzer = KeywordAnalyzer()
		porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"code": porter_analyzer, "description": porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": porter_analyzer}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_benchmark(writer, counter)
		writer.close()

		print "All jobs are done.."
		print str(counter)

	except CorruptIndexException as e:  # when the index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:  # when another writer is using the index
		e.printStackTrace()
	except IOException as e:  # when the directory can't be read or written
		e.printStackTrace()
Example #8
    def create_index(self, index_folder, docs_path, add_terms=False):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
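
The add_doc helper called above is not shown in this example. A plausible sketch that is consistent with the three FieldType objects configured in create_index; the field names, the type-to-field assignment, and ignoring add_terms are all assumptions (Document and Field are assumed imported elsewhere in the module):

    def add_doc(self, doc_id, title, txt, add_terms):
        # add_terms presumably toggles extra term fields; ignored in this sketch
        doc = Document()
        doc.add(Field("id", str(doc_id), self.t1))    # stored, indexed as DOCS only
        doc.add(Field("text", txt, self.t2))          # indexed with freqs, not stored
        doc.add(Field("title", title, self.t3))       # stored only, never indexed
        self.writer.addDocument(doc)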
Example #9
    def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
        #         if not jpype.isJVMStarted():
        #         lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)#is here?
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setRAMBufferSizeMB(256.0)  # flush automatically once buffered changes reach 256 MB of RAM
        self.config.setMaxBufferedDocs(10000)  # flush automatically after 10000 buffered docs
        if not computeLengthNorm:
            sim = CustomSimilarity()
            self.config.setSimilarity(sim)
        self.path = os.path.join(INDEX_PATH, indexDir)
        # print self.path
        # path.mkdir(self.path)
        #         if doClear:
        #             self.clearExistingIndex()
        self.store = SimpleFSDirectory(File(self.path))
        self.writer = IndexWriter(self.store, self.config)

        self.t1 = FieldType()  # field type t1
        self.t1.setIndexed(True)
        self.t1.setStored(True)
        self.t1.setTokenized(False)
        self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.t2 = FieldType()  # field type t2
        self.t2.setIndexed(True)
        self.t2.setStored(False)
        self.t2.setTokenized(True)
        self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
Example #10
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # print "init done"

        writer = IndexWriter(store, config)
        self.testDelete(root, writer)
        ticker = Ticker()
        print 'commit index deletion',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        # writer.close()
        ticker.tick = False
        print 'done'
        end["delete"] = datetime.now() - start

        # writer = IndexWriter(store, config)
        self.testAdd(root, writer)
        ticker = Ticker()
        print 'commit index addition',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #11
    def __init__(self, root, storedir, isindexing=False, isBM25=True):

        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
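
A query method to pair with this constructor might look like the following sketch; the field name "content" and the return format are assumptions, and QueryParser is assumed imported from org.apache.lucene.queryparser.classic:

    def search(self, querystring, limit=20):
        query = QueryParser("content", self.analyzer).parse(querystring)
        hits = self.searcher.search(query, limit).scoreDocs
        # (score, stored document) pairs, scored with BM25 when isBM25 was set
        return [(hit.score, self.searcher.doc(hit.doc)) for hit in hits]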
Example #12
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        if 1 == INDEX_MODE:  # APPEND
            config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        elif 2 == INDEX_MODE:  # CREATE
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:  # CREATE_OR_APPEND
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # print "init done"
        writer = IndexWriter(store, config)
        # print "init 2 done"

        self.indexDocs(root, writer)
        ticker = Ticker()
        print '\ncommit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #13
def main():
	try:
		print "Indexing..."
		indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
		# writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
		analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
			 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
			 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
		# KeywordAnalyzer: treats the entire field text as a single token
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		# PerFieldAnalyzerWrapper: lets you assign a different analyzer per field
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		config.setInfoStream(System.out)  # debug output for Lucene indexing; the Luke tool can also be used to inspect the index
		writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		counter = Counter()
		index_code_snippet(writer, counter)
		writer.commit()

		writer.close()
		print "Done"
		print str(counter)

	except CorruptIndexException as e:  # when index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:  # when other writer is using the index
		e.printStackTrace()
	except IOException as e:  # when directory can't be read/written
		e.printStackTrace()
	except SQLException as e:  # when Database error occurs
		e.printStackTrace()
Example #14
def createWriter(index_dir):
    indexDir = SimpleFSDirectory(File(index_dir).toPath())
    writerConfig = IndexWriterConfig()
    print(Codec.availableCodecs())
    print(f"Codec : {writerConfig.getCodec()}")
    writer = IndexWriter(indexDir, writerConfig)
    return writer
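
Typical usage of createWriter; this sketch assumes the JVM was already started with lucene.initVM(), that Document, TextField, and Field are imported from org.apache.lucene.document, and the path and field name are illustrative:

lucene.initVM()
writer = createWriter("/tmp/demo_index")  # path is illustrative
doc = Document()
doc.add(TextField("body", "hello lucene", Field.Store.YES))
writer.addDocument(doc)
writer.commit()
writer.close()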
Example #15
    def __init__(self, root, storeDir, doIndex=False):

        self.analyzer = StandardAnalyzer()

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        if doIndex:
            store = SimpleFSDirectory(Paths.get(storeDir))

            analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
            config = IndexWriterConfig(analyzer)
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexDocs(root, writer)
            ticker = Ticker()
            print("commit index")
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print("done")

        directory = SimpleFSDirectory(Paths.get(storeDir))
        self.searcher = IndexSearcher(DirectoryReader.open(directory))
Example #16
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode when opening lucene index. Available values are:
                'create', open new index and overwriting over index,
                'append', open existed index and append.
                'create_or_append', if `index_dir` exists, 'append',
                else 'create'
        date_format : string
            We save datetime field as string, `date_format` specify how to
            format datetime into string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)
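
A usage sketch for this Indexer; the index path, field name, and document content are illustrative, and Document, StringField, and Field are assumed imported from org.apache.lucene.document:

from datetime import datetime

indexer = Indexer('/tmp/idx', mode='create_or_append')
doc = Document()
# datetime values are stored as strings, formatted with date_format
doc.add(StringField("created_at",
                    datetime.now().strftime(indexer.date_format),
                    Field.Store.YES))
indexer.writer.addDocument(doc)
indexer.writer.commit()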
Example #17
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #18
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
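
Note on the chunking above: the loop emits one document every three sentences, and each document joins a window running from five sentences before the anchor to five after it, so consecutive documents overlap by several sentences. The overlap trades index size for recall: a phrase that straddles a chunk boundary still appears intact in a neighboring chunk.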
Example #19
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = KeywordAnalyzer()  # PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.close()
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Example #20
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
Example #21
def main():
    try:
        print "Indexing starts..."
        # indicesDestination = File("/Users/Falcon/Desktop/dyclink_2014")

        indicesDestination = File("/Indices/dyclink/2014")

        analyzer = KeywordAnalyzer()  # treats the whole text as a single token (effectively the same as not analyzing it)
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # field-to-analyzer map for PerFieldAnalyzerWrapper (a plain dict in Python)
        # http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Example #22
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
Example #23
    def createIndexStopwords(texts, route, rebuild):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "of", "on", "or", "such", "that", "the",
            "their", "then", "there", "these", "they", "this", "to", "was",
            "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
            "sin", "que", "es", "de", "en", "por", "y", "los"
        ]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        for key in texts:
            doc = Document()
            doc.add(
                Field("docName", key.__str__(), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)

        iwriter.close()
Example #24
    def __init__(self, index_store_path):

        store = NIOFSDirectory(Paths.get(index_store_path))
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        self.writer = IndexWriter(store, config)
Example #25
    def getConfig(self, analyzer):

        self.policy = MyDeletionPolicy()
        config = IndexWriterConfig(analyzer)
        config.setIndexDeletionPolicy(self.policy)

        return config
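
MyDeletionPolicy itself is not shown. In PyLucene, custom deletion policies are typically built on the PythonIndexDeletionPolicy extension class; a keep-every-commit sketch under that assumption (the import path and method bodies are assumptions, not the original class):

from org.apache.pylucene.index import PythonIndexDeletionPolicy

class MyDeletionPolicy(PythonIndexDeletionPolicy):
    """Never deletes a commit point, so old index snapshots stay readable."""
    def onInit(self, commits):
        pass  # called on writer startup with all existing commits

    def onCommit(self, commits):
        pass  # called after each commit; deleting nothing keeps every snapshot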
Example #26
    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 14g
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()
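
The forceMerge(1) call above rewrites the whole index into a single segment; that is expensive at build time but speeds up later searches on an index that will no longer change.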
Example #27
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)
    # write data to index

    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        cmd = 'robocopy %s %s\code_files *.py' % (
            r'%cd%', LUCENE_INDEX_DIR
        ) if system_flag == 'Windows' else 'cp -f *.py %s/code_files' % (
            LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Example #28
def main():
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!
    # write data to index

    if not is_index_Exist:
        #if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            os.system('robocopy %s %s\code_files *.py' %
                      (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Example #29
File: indexer.py  Project: zz-mars/rzync
    def __init__(self, root, storeDir, analyzer):
        # Create the index dir if it does not exist
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # the SimpleFSDirectory which the index will be written in
        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # create an index writer
        # attach the index dir and config info to it
        writer = IndexWriter(store, config)

        # call the indexing procedure
        # indexing all the files in the directory specified by root
        # write the index with writer
        self.indexDocs(root, writer)
        # start a ticker
        ticker = Ticker()
        print 'commit index'
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        # stop the ticker when the indexing procedure completes
        ticker.tick = False
        print 'Done'
Example #30
    def getWriter(self, store, analyzer=None, create=False):
        if analyzer is None:
            analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        return writer
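
A usage sketch for getWriter, called from inside the owning class. Note that the create flag is accepted but the method always opens in CREATE mode. A temporary directory stands in for a real index path, since in-memory directory classes vary across Lucene versions; Document is assumed imported from org.apache.lucene.document:

import tempfile
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory

store = SimpleFSDirectory(Paths.get(tempfile.mkdtemp()))
writer = self.getWriter(store)  # defaults to StandardAnalyzer
writer.addDocument(Document())  # empty document, purely illustrative
writer.commit()
writer.close()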