Example #1
File: indexer.py  Project: zz-mars/rzync
    def __init__(self, root, storeDir, analyzer):
        # Create the index dir if it does not exist
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # the SimpleFSDirectory in which the index will be written
        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # create an index writer
        # attach the index dir and config info to it
        writer = IndexWriter(store, config)

        # call the indexing procedure
        # indexing all the files in the directory specified by root
        # write the index with writer
        self.indexDocs(root, writer)
        # start a ticker
        ticker = Ticker()
        print 'commit index'
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        # stop the ticker when the indexing procedure completes
        ticker.tick = False
        print 'Done'
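
Most of these snippets share the same skeleton: create the index directory, wrap it in a Directory, build an IndexWriterConfig, feed documents to an IndexWriter, then commit() and close(). The Ticker they reference is the dot-printing progress helper from the PyLucene IndexFiles.py sample; a version consistent with how it is used here (run() loops until the caller sets tick to False):

import sys
import time

class Ticker(object):
    # prints a dot every second until the caller sets tick = False
    def __init__(self):
        self.tick = True

    def run(self):
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

For reference, a minimal self-contained sketch of the same commit/close pattern, assuming a recent PyLucene (7+) where IndexWriterConfig takes only the analyzer; the index path and field values below are illustrative:

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
store = SimpleFSDirectory(Paths.get("/tmp/example-index"))  # illustrative path
config = IndexWriterConfig(StandardAnalyzer())
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)

doc = Document()
doc.add(TextField("contents", "hello lucene", Field.Store.YES))
writer.addDocument(doc)

writer.commit()  # make the added documents durable and visible to new readers
writer.close()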
Example #2
    def __init__(self, root, storeDir, analyzer, type="html"):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.load_stop_words([
            "CNstopwords.txt",
            "ENstopwords.txt",
        ])
        self.html2text = HTML2Text()
        self.html2text.ignore_links = True
        self.html2text.ignore_images = True
        type_to_index = {
            "html": self.index_html,
            "image": self.index_image,
        }
        type_to_index[type](root, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Example #3
    def __init__(self, root, storeDir_good, storeDir_bad, analyzer):

        if not os.path.exists(storeDir_good):
            os.mkdir(storeDir_good)
        if not os.path.exists(storeDir_bad):
            os.mkdir(storeDir_bad)

        store_good = SimpleFSDirectory(File(storeDir_good))
        store_bad = SimpleFSDirectory(File(storeDir_bad))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        config1 = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config1.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer_good = IndexWriter(store_good, config)
        writer_bad = IndexWriter(store_bad, config1)

        self.indexDocs(root, writer_good, writer_bad)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer_good.commit()
        writer_good.close()
        writer_bad.commit()
        writer_bad.close()
        ticker.tick = False
        print 'done'
Example #4
class Indexer(object):

    def __init__(self, index_store_path):

        store = NIOFSDirectory(Paths.get(index_store_path))
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        self.writer = IndexWriter(store, config)

    @abstractmethod
    def index_single_file(self, doc_file):
        pass

    def index_doc(self, doc_path):
        if os.path.isfile(doc_path):
            return 1, self.index_single_file(doc_path)

        # index all docs in doc_path dir
        total = 0
        doc_num = 0
        for root, _, files in os.walk(doc_path, topdown=False):
            for name in files:
                doc_file = os.path.join(root, name)
                total += self.index_single_file(doc_file)
                doc_num += 1
        return doc_num, total

    def __del__(self):
        logger.info('committing index...')
        self.writer.commit()
        self.writer.close()
        logger.info('done')
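
Since index_single_file is abstract, Example #4 needs a concrete subclass before it can index anything. A hypothetical minimal subclass (the class name, field names, and the "return a byte count" convention are illustrative, not from the source):

from org.apache.lucene.document import Document, Field, StringField, TextField

class TextFileIndexer(Indexer):
    # hypothetical: index one plain-text file, return its size for the total
    def index_single_file(self, doc_file):
        with open(doc_file) as f:
            contents = f.read()
        doc = Document()
        doc.add(StringField("path", doc_file, Field.Store.YES))
        doc.add(TextField("contents", contents, Field.Store.NO))
        self.writer.addDocument(doc)
        return len(contents)

# usage (illustrative paths):
# indexer = TextFileIndexer('/tmp/example-index')
# doc_num, total = indexer.index_doc('/tmp/docs')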
Example #5
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        if 1 == INDEX_MODE:  # APPEND
            config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        elif 2 == INDEX_MODE:  # CREATE
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:  # CREATE_OR_APPEND
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # print "init done"
        writer = IndexWriter(store, config)
        # print "init 2 done"

        self.indexDocs(root, writer)
        ticker = Ticker()
        print '\ncommit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #6
	def __init__(self,root,storeDir,analyzer):
		# Create the index dir if it does not exist 
		if not os.path.exists(storeDir):
			os.mkdir(storeDir)
		# the SimpleFSDirectory in which the index will be written
		store = SimpleFSDirectory(File(storeDir))
		analyzer = LimitTokenCountAnalyzer(analyzer,1048576)
		config = IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		# create an index writer
		# attach the index dir and config info to it
		writer = IndexWriter(store,config)

		# call the indexing procedure
		# indexing all the files in the directory specified by root
		# write the index with writer
		self.indexDocs(root,writer)
		# start a ticker
		ticker = Ticker()
		print 'commit index'
		threading.Thread(target=ticker.run).start()
		writer.commit()
		writer.close()
		# stop the ticker when the indexing procedure completes
		ticker.tick = False
		print 'Done'
Example #7
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # print "init done"

        writer = IndexWriter(store, config)
        self.testDelete(root, writer)
        ticker = Ticker()
        print 'commit index deletion',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        # writer.close()
        ticker.tick = False
        print 'done'
        end["delete"] = datetime.now() - start

        # writer = IndexWriter(store, config)
        self.testAdd(root, writer)
        ticker = Ticker()
        print 'commit index addition',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #8
    def __init__(self, root, storedir, isindexing=False, isBM25=True):

        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
Example #9
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #10
    def __init__(self, root, storeDir, doIndex=False):

        self.analyzer = StandardAnalyzer()

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        if doIndex:
            store = SimpleFSDirectory(Paths.get(storeDir))

            analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
            config = IndexWriterConfig(analyzer)
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexDocs(root, writer)
            ticker = Ticker()
            print("commit index")
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print("done")

        directory = SimpleFSDirectory(Paths.get(storeDir))
        self.searcher = IndexSearcher(DirectoryReader.open(directory))
Example #11
def index(personDB, familyDB, relationDB):
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
#?#indexWriter.setRAMBufferSizeMB(50);  CHECK 256

    mt = matchtext()

    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    #Family matchtext
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex','FAM', StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
Example #12
def main():
	try:
		print "Indexing..."
		indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
		# writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
		analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
			 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
			 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
		# KeywordAnalyzer: treats the entire field text as a single token
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		# PerFieldAnalyzerWrapper: a class that lets you assign a separate analyzer per field
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		config.setInfoStream(System.out)  # debug Lucene indexing; the Luke tool can also be used to inspect and manage a Lucene index
		writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		counter = Counter()
		index_code_snippet(writer, counter)
		writer.commit()

		writer.close()
		print "Done"
		print str(counter)

	except CorruptIndexException as e:  # when index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:  # when other writer is using the index
		e.printStackTrace()
	except IOException as e:  # when directory can't be read/written
		e.printStackTrace()
	except SQLException as e:  # when Database error occurs
		e.printStackTrace()
Example #13
    def __init__(self, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.folders = {
            'parsed_ctrip':
            ['source', 'location', 'introduction', 'score', 'img_list'],
            'parsed_qunar':
            ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
            'eic_mfw': ['location', 'introduction', 'img_list']
        }
        self.special_tags = ['introduction']
        self.files = self.__getAllPlaces()
        #self.readers = self.__constructReaders()

        self.indexDocs(writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #14
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
Example #15
    def index(cls, indexDir, taxoDir):
        """Create an index and add sample documents and facets to it.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        from org.apache.lucene.util import Version
        config = IndexWriterConfig(Version.LUCENE_42,
                                   WhitespaceAnalyzer(Version.LUCENE_42))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields is a utility class for adding facet fields to a document:
        facet_fields = FacetFields(taxo)

        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [CategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
            #       Python list in order to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List:
            facetList = Arrays.asList(facetList)

            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # use the FacetFields utility class for adding facet fields (i.e. the categories)
            # to the document (and, as required, to the taxonomy index)
            facet_fields.addFields(doc, facetList)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded +=1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
Example #16
 def deleteRec(self, pid):
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     writer.deleteDocuments(Term('uid', pid))
     writer.commit()
     writer.close()
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Example #17
    def index(self, data):
        writer = IndexWriter(
            self.d, self.conf)

        doc = self.buildDocument(data['fields'], data['record'])
        writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #18
 def deleteRec(self, pid):
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     writer.deleteDocuments(Term('uid', pid))
     writer.commit()
     writer.close()
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Example #19
class Indexer(object):
    def __init__(self, docDir, indexDir, analyzer):
        #set index dir
        if not os.path.exists(indexDir):
            os.makedirs(indexDir)
        self.indexDir = SimpleFSDirectory(Paths.get(indexDir))
        self.docDir = docDir

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        writerConfig = IndexWriterConfig(self.analyzer)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.indexing()

    def indexing(self):
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.NONE)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        for filename in os.listdir(self.docDir):
            if filename.endswith('.html') or filename.endswith('.htm'):
                with open(os.path.join(self.docDir, filename)) as f:
                    url = f.readline().strip()
                    htmlString = f.read()
                    #remove HTML markup
                    soup = BeautifulSoup(htmlString, 'html.parser')
                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract()  # rip it out
                    # get text
                    text = soup.get_text()
                    # break into lines and remove leading and trailing space on each
                    lines = (line.strip() for line in text.splitlines())
                    # break multi-headlines into a line each
                    chunks = (phrase.strip() for line in lines
                              for phrase in line.split("  "))
                    # drop blank lines
                    text = '\n'.join(chunk for chunk in chunks if chunk)
                    #text = soup.get_text().strip()
                    title = soup.title.string
                    #print text
                doc = Document()
                doc.add(Field("link", url, t1))
                doc.add(Field("title", title, t1))
                doc.add(Field("text", text, t2))
                self.writer.addDocument(doc)
                print "index document", filename

        self.writer.commit()
        self.writer.close()
Example #20
    def rebuildIndex(self, data):
        writer = IndexWriter(
            self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))

        for record in data['records']:
            doc = self.buildDocument(data['fields'], record)
            writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #21
    def __init__(self, root, storeDir, analyzer):
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.commit()
        writer.close()
Example #22
class WikiPageIndex():
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()


    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)

        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
Example #23
    def indexing(self):
        docs = self.text.load_seg_without_stopword_data()

        if (os.path.exists(self.index_dir)):
            r = input("Index dir already exists! Continue indexing? (y/n)")
            if (r.lower() != 'y'):
                return -1

        if (not os.path.exists(self.index_dir)):
            os.makedirs(self.index_dir)
        store = SimpleFSDirectory(Paths.get(self.index_dir))

        # todo
        # version.LUCENE_6_5_0
        # analyzer = CJKAnalyzer(CharArraySet.EMPTY_SET)
        # analyzer =SmartChineseAnalyzer()

        # analyzer = StandardAnalyzer(Version.LUCENE_6_5_0)
        # index_writer = IndexWriter(store,analyzer,True,IndexWriter.MaxFieldLength(512))
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        id_conf = FieldType()
        id_conf.setStored(True)
        id_conf.setTokenized(False)
        id_conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        date_conf = FieldType()
        date_conf.setStored(True)
        date_conf.setTokenized(True)
        date_conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        content_conf = FieldType()
        content_conf.setStored(True)
        content_conf.setTokenized(True)
        content_conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for n, i in enumerate(docs):
            document = Document()

            for key, content in i.items():
                if (key == 'PubDate'):
                    document.add(Field(key, content, date_conf))
                else:
                    document.add(Field(key, content, content_conf))
            document.add(Field('id', str(n), id_conf))
            writer.addDocument(document)
            if (n % 1000 == 0):
                print(n)

        writer.commit()
        writer.close()
Example #24
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
Example #25
def delete(indexDir: str, id: str):
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())

    index_writer = IndexWriter(index_dir, config)

    delete_term_query = RegexpQuery(Term('id', id))
    delete_reg_query = RegexpQuery(Term('id', id + '\..*'))

    index_writer.deleteDocuments(delete_term_query)
    index_writer.deleteDocuments(delete_reg_query)
    index_writer.commit()
    index_writer.close()
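
Example #25 issues two deletes so that both the exact id and any "child" ids of the form id.suffix are removed. When only the exact id must go, a plain Term delete is enough; a sketch under the same imports and names as above (the id value is illustrative):

index_dir = SimpleFSDirectory(Paths.get(indexDir))
index_writer = IndexWriter(index_dir, IndexWriterConfig(StandardAnalyzer()))
index_writer.deleteDocuments(Term('id', '42'))  # exact match, no regexp needed
index_writer.commit()
index_writer.close()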
Example #26
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
Example #27
def create_miniindex(docs):
    index_store = RAMDirectory()
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    for doc in docs:
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return index_store
Example #28
class DocRepo:
    def __init__(self):
        # self.analyzer = StandardAnalyzer()
        # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
        # self.analyzer = PersianAnalyzer()
        self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
        self.config = IndexWriterConfig(self.analyzer)
        self.index = RAMDirectory()
        self.w = IndexWriter(self.index, self.config)

    def addDocument(self, id):
        global answers_train
        preA = answers_train[id]
        doc = Document()
        doc.add(TextField("pa", preA, Field.Store.YES))
        doc.add(StringField("id", str(id), Field.Store.YES))
        self.w.addDocument(doc)
        self.w.commit()

    def __del__(self):
        self.w.close()

    def get_most_similar(self, sentence, do_log=False):
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
Example #29
def main():
    facts = get_all_facts()
    print("Preparing to index {} facts".format(len(facts)))

    store_dir = "lucene_index"
    store = SimpleFSDirectory(Paths.get(store_dir))
    analyzer = EnglishAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store, config)
    index_facts(facts, writer)
    writer.commit()
    writer.close()
    print("Lucene index created at: {}".format(store_dir))
Example #30
 def buildIndex(self, inputFile):
     analyzer = self.getAnalyzer()
     iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
     
     iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter( SimpleFSDirectory( File(self.luceneDir) ), iwconf)
     
     # read through input file and write out to lucene
     counter = 0
     linesReadCounter = 0
     
     with open(inputFile, 'r') as lines:
         linesRead = 0
         
         for line in lines:
             try:
                 linesRead+=1
                 
                 if linesRead % 1000 == 0:
                     print "%d lines read" % linesRead
                     
                 cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t")
                 concept = concept.strip()
                 cui = cui.strip()
                 
                 strNorm = self.normalizeCasePunct(concept)
                 strSorted = self.sortWords(strNorm)
                 strStemmed = self.stemWords(strNorm)
                 strStemmedSorted = self.stemWords(strSorted)
       
                 fdoc = Document()
                 
                 counter +=1
                 fid = counter
                 
                 fdoc.add( Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 writer.addDocument(fdoc)
                 if fid % 1000 == 0:
                     writer.commit()
             except:
                 print "Skipping line: %s" % line
                 
     writer.commit()
     writer.close()
Example #31
def main():
    try:
        print "Indexing..."
        #########################################  Path  ####################################
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": analyzer,
            "extends": analyzer,
            "used_classes": analyzer,
            "methods": analyzer,
            "class_instance_creation": analyzer,
            "methods_called": analyzer,
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        # analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
        # 	 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
        # 	 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
        # 	 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
        # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #32
def commit(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)

    writer.commit()
    writer.close()
Example #33
def commit(collection_name):
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)

	writer.commit()
	writer.close()
Example #34
 def __init__(self, root, storeDir, analyzer): 
     if not os.path.exists(storeDir): 
         os.mkdir(storeDir) 
     store = SimpleFSDirectory(File(storeDir)) 
     analyzer = LimitTokenCountAnalyzer(analyzer, 1000)  # 1048576
     config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) 
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) 
     writer = IndexWriter(store, config) 
     self.indexDocs(root, writer) 
     ticker = Ticker() 
     print 'commit index', 
     threading.Thread(target=ticker.run).start() 
     writer.commit() 
     writer.close() 
     ticker.tick = False 
     print 'done'
Example #35
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    ### as of now, deletion of documents is supported only on indexed keys ###
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 000
Example #36
def delete(primary_keys_map,collection_name,todelete,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	try:
		tofind_keyvalue_pairs=json.loads(todelete)
	except:
		return 100	
	

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	try:
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
		ireader=IndexReader.open(direc)
	except:
		return 105

	### as of now, deletion of documents is supported only on indexed keys ###
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents according to primary keys		
	query=BooleanQuery()
	for key in tofind_primary_keyvalue_pairs.keys():
		temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
		query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

	a=writer.deleteDocuments(query)
	if commit==True:
		writer.commit()
	writer.close()
	return 000
Example #37
    def __init__(self, root, storeDir, analyzer):
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Example #38
class IndexFiles(object):
    def __init__(self, root, analyzer):
        self.store = RAMDirectory()
        self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.store, config)
        self.numDocs = self.indexDocs(root, self.writer)
        self.writer.commit()
        self.writer.close()

    def indexDocs(self, root, writer):
        path = root + "/data/*/*.xml"
        # print(path)
        xml_files = glob.glob(path)
        # xml_files = ["HAM2-031201.xml"]
        numDocs = 0
        for xml in xml_files:
            try:
                parser = etree.XMLParser(recover=False, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            except etree.XMLSyntaxError as e:
                parser = etree.XMLParser(recover=True, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            root = tree.getroot()
            for text in root.iter("TEXT"):
                contents = "".join(text.xpath("text()")).strip()
                doc_no = text.getparent().find("DOCNO").text
                # print("adding", doc_no)
                try:
                    doc = Document()
                    doc.add(StringField("id", doc_no, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(
                            TextField("contents", contents, Field.Store.YES))
                    else:
                        pass
                        # print("warning: no content in %s" % doc_no)
                    writer.addDocument(doc)
                    numDocs += 1
                except Exception as e:
                    print("Failed in indexDocs:", e)
        return numDocs
Example #39
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Create the index writer config.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Example #40
def indexing():
    print("建立索引,文本文件夹 [%s] ..." % TEXT_DIR)
    create_dir(INDEX_DIR)
    directory = SimpleFSDirectory(Paths.get(INDEX_DIR))
    config = IndexWriterConfig(ANALYZER)
    writer = IndexWriter(directory, config)

    for x in glob.glob(os.path.join(TEXT_DIR, "*.txt")):
        title, post, terms = get_terms(x)
        doc = Document()
        if terms:
            doc.add(Field("title", title, TextField.TYPE_STORED))
            doc.add(Field("post", post, TextField.TYPE_STORED))
            doc.add(Field("terms", terms, TextField.TYPE_STORED))
            writer.addDocument(doc)

    writer.commit()
    writer.close()
Example #41
    def __init__(self, storeDir, analyzer, function):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        eval("self." + function + "(writer)")
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'index build complete.'
Example #42
    def __init__(self, destination_directory, analyzer):

        if not os.path.exists(destination_directory):
            os.mkdir(destination_directory)

        store = SimpleFSDirectory(File(destination_directory))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.tweetIndexer(writer)
        ticker = Ticker()
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #43
File: IndexFiles.py  Project: w2wei/XPRC
    def __init__(self, root, storeDir, analyzer):
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir)) # Store index files in the file system; try NIOFSDirectory
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) # maxTokenCount=1048576; this analyzer limits the number of tokens per field, not necessary for indexing MEDLINE
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #44
    def __init__(self, fileRoot, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store    = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config   = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setSimilarity(similarities.BM25Similarity())
        # Available similarities: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer   = IndexWriter(store, config)

        self.indexDocs(fileRoot, writer)
        print 'commit index',
        writer.commit()
        writer.close()
        print 'done'
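
As Example #8 shows, a similarity chosen at index time should normally also be set on the searcher at query time. A sketch of the search-side counterpart, assuming a recent PyLucene where SimpleFSDirectory takes a java.nio Path (the directory path is illustrative):

from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import SimpleFSDirectory

reader = DirectoryReader.open(SimpleFSDirectory(Paths.get("/tmp/example-index")))
searcher = IndexSearcher(reader)
searcher.setSimilarity(BM25Similarity())  # match the index-time similarity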
Example #45
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Create the index writer config.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    
    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
     
    writer.commit()
    writer.close()
Example #46
def main():
    lucene.initVM()
    print 'lucene version ', lucene.VERSION
    version = Version.LUCENE_CURRENT
    index_store = SimpleFSDirectory(File(index_path))
    # analyzer = StandardAnalyzer(version)
    analyzer = PorterStemmerAnalyzer()
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    start = dt.now()
    indexCranFull(document_path, writer)
    writer.commit()
    writer.close()
    end = dt.now()

    print 'elapsed time for indexing documents:'
    print end - start
Example #47
 def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     mt = matchtext()
     writer.deleteDocuments(Term('uid', pid1))
     writer.deleteDocuments(Term('uid', pid2))
     p = personDB.find_one({'_id': pid1})
     matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
     doc = Document()
     doc.add(Field('uid',str(pid1), StringField.TYPE_STORED))
     doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
     doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
     doc.add(Field("text", mt.luceneFix(self.personText(p)), TextField.TYPE_NOT_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Example #48
class Indexer:
	
	def __init__(self, writerConfig, indexDir):
		
		lucene.initVM()

		self.mIndexDir = SimpleFSDirectory(File(indexDir))
		self.mConfig = writerConfig
		self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
	

	def index(self, root):

		t = FieldType()
		t.setIndexed(True)
		t.setStored(True)
		t.setTokenized(True)
		t.setStoreTermVectors(True)
		
		for path, dirs, files in os.walk(root):
			
			for file in files:
				
				filePath = os.path.join(path, file)
				fd = open(filePath)
				content = unicode(fd.read(), 'iso-8859-1')
				fd.close()
				
				doc = Document()
				doc.add(Field('name', file, StringField.TYPE_STORED))

				parent = os.path.split(path)[1]
				doc.add(Field('parent', parent, StringField.TYPE_STORED))

				if len(content) > 0:
					doc.add(Field('content', content, t))

				print 'Indexing %s' % file
				self.mWriter.addDocument(doc)

		self.mWriter.commit()
		self.mWriter.close()
Example #49
def indexContent(indexDir):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
    """ Pegando direitorio passado pelo parametro"""
    directory = SimpleFSDirectory(File(indexDir)) 
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(directory, config)
    
    manpath = os.environ.get('MANPATH', '/home/massilva/Documentos/Ogri/Codigo/information_retrieval_20132/equipe_1/jsons/geral/').split(os.pathsep)

    for dir in manpath:
        print "Crawling", dir
        for name in os.listdir(dir):
            path = os.path.join(dir, name)
            if os.path.isdir(path):
                indexDirectory(path,writer)
    writer.commit()
    writer.close()
Example #50
 def __init__(self, **kwargs):
     xmlpath = kwargs.get('xmlpath')
     storeDir = kwargs.get('storeDir')
     analyzer = kwargs.get('analyzer')
     ItemClass = kwargs.get('ItemClass')
     if not os.path.exists(storeDir):
         os.mkdir(storeDir)
     store = SimpleFSDirectory(File(storeDir))
     analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
     config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(store, config)
     # self.indexDocs(xmlpath, writer)
     self.indexXML(xmlpath, writer, ItemClass)
     ticker = Ticker()
     print 'commit index',
     threading.Thread(target=ticker.run).start()
     writer.commit()
     writer.close()
     ticker.tick = False
     print 'done'
Example #51
 def __index(self, emailInfo):
     from org.apache.lucene.index import IndexWriterConfig
     from org.apache.lucene.util import Version
     from org.apache.lucene.analysis.standard import StandardAnalyzer
     analyser = StandardAnalyzer(Version.LUCENE_33)
     conf = IndexWriterConfig(Version.LUCENE_33, analyser)
     from org.apache.lucene.store import FSDirectory
     from java.io import File
     storage = File.createTempFile(u'Tubelight-', '.index')
     storage.delete()
     storage.mkdir()
     storage.deleteOnExit()
     self.storage = storage.getAbsolutePath()
     from java.io import File
     self.session.setAttribute('directory', storage.getAbsolutePath()+File.separator+'mail.idx')
     directory = FSDirectory.open(storage)
     from org.apache.lucene.index import IndexWriter
     iw = IndexWriter(directory, conf)
     from us.d8u.tubelight import Configuration
     addr = emailInfo[Configuration.EmailAddressKey]
     (username, server) = addr.split('@')
     from java.lang import System
     System.setProperty("mail.imap.partialfetch", "false")
     urlPrefix = (("imap://%s@%s:%d/Inbox") % (username, server, int(emailInfo[Configuration.EmailPortKey])))
     from javax.mail import Session
     session = Session.getDefaultInstance(System.getProperties(), None).getStore(emailInfo[Configuration.EmailProtocolKey])
     session.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
     folder = session.getDefaultFolder()
     for m in folder.getMessages():
         from org.apache.lucene.document import Document
         d = Document()
         subject = Field("subject", m.getSubject(), Field.Store.YES, Field.Index.ANALYZED)
         # join all recipients into one comma-separated string
         toSrc = u', '.join(str(r) for r in m.getAllRecipients())
         to = Field("to", toSrc, Field.Store.YES, Field.Index.ANALYZED)
         d.add(to)
         d.add(subject)
         iw.addDocument(d)
     iw.commit()
     self.searcher = IndexSearcher(directory)
Example #52
    def __init__(self, storeDir, aWrapper):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        aWrapper = LimitTokenCountAnalyzer(aWrapper, 1048576)
        bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1=2.0, b=0.75 (Lucene defaults: k1=1.2, b=0.75)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, aWrapper)
        config.setSimilarity(bm25Sim)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)


        self.indexTable(writer)
        ticker = Ticker()
        print 'commit index'
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #53
    def index(self, personDB, familyDB, relationDB):
        """
        indexes a database
        Field match includes information about parents and is used to find matches
        Field text has Ids, names, places, and dates and is used to find a person/family
        """
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(self.indexDir, config)
        #indexWriter.setRAMBufferSizeMB(256)  #?

        mt = matchtext()

        for p in personDB.find({}, no_cursor_timeout=True):
            matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
            doc = Document()
            doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
            doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
            doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
            doc.add(Field("text", mt.luceneFix(self.personText(p)), TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)

        #Family matchtext
        for f in familyDB.find():
            #matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
            doc = Document()
            doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
            #doc.add(Field('sex','FAM', StringField.TYPE_STORED))
            #doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
            txt = str(f['_id'])
            if 'refId' in f: txt += ' ' + f['refId']
            doc.add(Field("text", txt, TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
        return
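A hypothetical search against the index built above (not part of the original class); it assumes the same self.analyzer and self.indexDir and the Version-less Lucene API this example uses, and the query text is invented:

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    reader = DirectoryReader.open(self.indexDir)
    searcher = IndexSearcher(reader)
    # the 'text' field carries IDs, names, places, and dates
    query = QueryParser('text', self.analyzer).parse('Anna Stockholm 1872')
    for hit in searcher.search(query, 25).scoreDocs:
        print(searcher.doc(hit.doc).get('uid'))
    reader.close()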
Example #54
0
def store(primary_keys_map,to_be_compressed_input,collection_name,data,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT	
	print "started indexing input data......"
	
	#extracting values
	try:
		contents=json.loads(data)
	except ValueError:
		return 100


	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	

	#checking for existence of a record with the same primary_key set
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(contents[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		if len(hits) > 0:
			return 106
	except:
		#no index exists yet, so there can be no duplicates to check
		pass
	
	


	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)
	#fix this later.....FieldType not defined
	#field_type=FieldType()
	#field_type.setIndexed(True)
	#field_type.setStored(False)
	#field_type.setTokenized(False)
	
	try:
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,contents[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data=snappy.compress(data)
		field=Field("$DATA$",data,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
		if commit==True:
			writer.commit()
		writer.close()
		return 0  #success
	except:
		return 102
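A hypothetical call sketch for store() above (the collection name, key, and payload are invented for illustration):

	record = json.dumps({"id": "42", "name": "alice"})
	status = store(["id"], False, "users", record, commit=True)
	# 0 on success; 100 = bad JSON, 101 = missing primary key, 102 = write error, 106 = duplicate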
Example #55
0
class IndexingEngine():

	def __init__(self):

		self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
		self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
		self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()


		############################# Writer Configuration #####################################
		map = HashMap()
		map.put('name', self.mAnalyzers['name'])
		map.put('parent', self.mAnalyzers['parent'])
		map.put('content', self.mAnalyzers['default'])
		map.put('id', self.mAnalyzers['id'])		

		analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)

		self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
		self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)

		if settings.ADMINS_ENGINE.mSimilarity is not None:
			self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
		########################################################################################


		directory = SimpleFSDirectory(File(self.mIndexDirectory))
		self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)


		############################# FieldType Preparation #####################
		nameField = FieldType()
		nameField.setIndexed(True)
		nameField.setStored(True)
		nameField.setTokenized(True)
		nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

		parentField = FieldType()
		parentField.setIndexed(True)
		parentField.setStored(True)
		parentField.setTokenized(True)
		parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

		contentField = FieldType()
		contentField.setIndexed(True)
		contentField.setStored(True)
		contentField.setTokenized(True)
		contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

		idField = FieldType()
		idField.setIndexed(True)
		idField.setStored(True)
		idField.setTokenized(False)
		idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)


		self.mFieldTypes = {
			'name' 		: nameField,
			'parent'	: parentField,
			'content'	: contentField,
			'id'		: idField
		}
		#######################################################################

		self.mLog = ""

	

	def indexing(self, root = settings.ADMINS_ENGINE.mDocumentDirectory, parent = None, docID = 1, parentID = 0, id = 0):

		#a mutable default argument would be shared across calls, so build the list here
		if parent is None:
			parent = []
		realPath = os.path.abspath(root)
		for i in os.listdir(realPath):

			path = os.path.join(realPath, i)
			if os.path.isfile(path):
				#index this file
				doc = Document()

				doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
				doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
				doc.add(Field('id', str(docID), self.mFieldTypes['id']))
				doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

				fd = open(path, 'r')
				content = fd.read()
				fd.close()

				if len(content) > 0:
					doc.add(Field('content', content, self.mFieldTypes['content']))

				self.mIndexWriter.addDocument(doc)
				##################### Logging ##############################
				if IS_DEBUG:
					nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
					parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
					contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
					self.mLog = self.mLog + ( "File %s\n   {name - %s}: %s\n   {parent - %s}: %s\n   {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )



				docID = docID + 1
				################### index sub commands	
				if os.path.isdir(path + ".sub"):
					parent.append(i)
					docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
					parent.pop()
					
						
		
		if id == 0:
			self.mIndexWriter.commit()
			self.mIndexWriter.close()
			
			if IS_DEBUG:
				loggingBot = LoggingBot(self.mLog, settings.ADMINS_ENGINE.getIndexingLogQueue())
				loggingBot.start()
				self.mLog = ""
		return docID
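A hypothetical driver for the engine above, assuming settings.ADMINS_ENGINE is fully configured:

	engine = IndexingEngine()
	total = engine.indexing()  # walks mDocumentDirectory recursively and commits at the end
	print 'indexed %d documents' % (total - 1)  # docID starts at 1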
Example #56
0
File: views.py Project: kevkid/YIF
def survey(request):
    ipAddr = get_client_ip(request)
    instances = [i[0] for i in Classes.objects.values_list('image_class_desc')]
    #cnt = len(instances)
    #lets get out choice
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    
        
    try:
        #image_class = image.objects.get(pk=request.POST['survey'])
        s = request.POST['survey']#get from post
        
                
    except (KeyError, Classes.DoesNotExist):
        return render(request, 'web/index.html',{
            'error_message': "You didn't select a choice.",
        })
    else:
        image_class = instances[int(s)]
        docNum = request.POST['imageID']#get document id
        doc = reader.document(int(docNum))
        fname = doc.get("filename")
        print(fname)
        #SimpleFSDirectory(File(location)).clearLock(IndexWriter.WRITE_LOCK_NAME);
        fileClassField = doc.get("Classification")
        if str(fileClassField) == "None":  #doc.get() returns None when the field does not exist
            fileClassField = str(ipAddr + ":" + image_class)#I think we must add an ip address to this
        else:
            fileClassField = str(ipAddr + ":" + fileClassField) + ", " + image_class
            
        #doc.removeField("Classification")
        
        #doc.add(StringField("Classification", fileClassField, Field.Store.YES))
        #t = doc.get("Classification")
        #reader.close()
        indexDir = SimpleFSDirectory(File(location))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)
        fields = doc.getFields()#get all fields
        doc2 = Document()
        classificationFieldFlag = False
        for f in fields:
            field = Field.cast_(f)
            (k, v) = field.name(), field.stringValue()
            if k == "Classification":
                classificationFieldFlag = True
                field = StringField("Classification", fileClassField, Field.Store.YES)
                doc2.add(field)
            else:
                doc2.add(field)

        if not classificationFieldFlag:  #the document lacks the field, so add it
            doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
#         doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
#         doc2.add(StringField("fid", doc.get("fid"), Field.Store.YES))
#         doc2.add(StringField("articleid", doc.get("articleid"), Field.Store.YES))
#         doc2.add(StringField("caption", doc.get("caption"), Field.Store.YES))
#         doc2.add(StringField("figureid", doc.get("figureid"), Field.Store.YES))
#         doc2.add(StringField("filename", doc.get("filename"), Field.Store.YES))
#         doc2.add(StringField("filepath", doc.get("filepath"), Field.Store.YES))
#         doc2.add(StringField("label", doc.get("label"), Field.Store.YES))
        
        #writer.updateDocument(Term("fid","f000000000023"), doc2)#If field exists update
        writer.updateDocument(Term("fid", doc.get("fid")), doc2)#If field exists update
        writer.commit()
        #writer.optimize()
        writer.close()
        #writer.unlock(SimpleFSDirectory(File(location)))
        
    return HttpResponseRedirect(reverse('web:index', args=()))
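The copy-and-replace dance above is needed because Lucene documents are immutable once indexed: updateDocument(term, doc) atomically deletes every document matching the term and adds the new one. A standalone sketch of the same pattern (names and values invented):

    # assumes an open writer and a stored, unique "fid" field
    newDoc = Document()
    newDoc.add(StringField("fid", "f000000000023", Field.Store.YES))
    newDoc.add(StringField("Classification", "127.0.0.1:figure", Field.Store.YES))
    writer.updateDocument(Term("fid", "f000000000023"), newDoc)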
Example #57
0
def update(primary_keys_map,to_be_compressed_input,collection_name,tofind,update,commit=False,add_field_if_not_exists=True):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	#for now, update is implemented as: search, modify the JSON data, delete, and re-write
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT
	try:
		tofind_keyvalue_pairs=json.loads(tofind)
	except ValueError:
		return 100	
	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		#setting writer configurations
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
	except:
		return 105
	no_of_documents_modified=0	
	#finding the document to update
	#Scope for making this more efficient
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		#primary_key_modified=False

		#delete the appropriate document
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		

		#modify the values
		for key,value in toupdate.items():
			#if the key is absent, either add it to data or ignore it, controlled by add_field_if_not_exists (True by default)
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:		
				data[key]=value

		#the deletion is intentionally deferred to this point:
		#the update proceeds only if the modified primary keys do not already exist in the index
		#note: if the update leaves the primary keys unchanged, this check matches the document being updated itself and aborts with 106
		query_search=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
		if len(hits) > 0:
			return 106			
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data_string=snappy.compress(str(json.dumps(data)))
		else:
			data_string=json.dumps(data)	
		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)

	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents		
	if len(tofind_primary_keyvalue_pairs)>0:		
		query=BooleanQuery()
		for key in tofind_primary_keyvalue_pairs.keys():
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		
		for hit in hits:
			doc=searcher.doc(hit.doc)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					if rewrite(data)!=106:
						no_of_documents_modified+=1
					else:
						writer.rollback()
						return 106	
			else:
				if rewrite(data)!=106:
					no_of_documents_modified+=1
				else:
					writer.rollback()
					return 106
				
			
	else:
		for i in range(0,ireader.numDocs()):
			doc=searcher.doc(i)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					if rewrite(data)!=106:
						no_of_documents_modified+=1
					else:
						writer.rollback()
						return 106
			else:
				if rewrite(data)!=106:
					no_of_documents_modified+=1
				else:
					writer.rollback()
					return 106
			
	
	ireader.close()
	if commit==True:
		writer.commit()
	writer.close()
	return str(no_of_documents_modified)+" documents have been modified"
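A hypothetical call sketch for update() above (keys and values invented for illustration):

	result = update(["id"], False, "users",
	                tofind=json.dumps({"id": "42"}),
	                update=json.dumps({"name": "bob"}),
	                commit=True)
	# returns "<n> documents have been modified" on success, or 100/105/106 on error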
Example #58
0
# assumed setup (not shown in the original snippet): imports, an in-memory
# directory, and an open writer using the default analyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import (DirectoryReader, IndexOptions,
                                     IndexWriter, IndexWriterConfig)
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.util import BytesRefIterator

directory = RAMDirectory()
iwriter = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))

ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator()

    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()

        print 'term:', term.utf8ToString()
        print '  freq:', freq
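A possible extension (not in the original) of the inner term loop that also walks positions and character offsets, which the FieldType above stores; it assumes the Lucene 5+ PostingsEnum flags API:

    from org.apache.lucene.index import PostingsEnum
    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None, PostingsEnum.ALL)  # request positions + offsets
        dpEnum.nextDoc()
        print 'term:', term.utf8ToString()
        for _ in xrange(dpEnum.freq()):
            pos = dpEnum.nextPosition()
            print '  pos %d, offsets [%d-%d]' % (pos, dpEnum.startOffset(), dpEnum.endOffset())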
Example #59
0
File: index.py Project: asxzy/weiso
INDEX_DIR = 'index'
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
print "init lucene"
directory = SimpleFSDirectory(File(INDEX_DIR))
analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)


print 'doing index'
count = 0
for node in db.nodes.find({},{"node_id","screen_name","description"}):
    count += 1
    if count % 100000 == 0:
        writer.commit()
        print count
    doc = Document()
    string = str(node["node_id"]) + ' '
    string += node.get("screen_name", '') + ' '
    string += node.get("description", '') + ' '
    doc.add(Field("text", string , TextField.TYPE_NOT_STORED))
    doc.add(Field("id", str(node["node_id"]), StringField.TYPE_STORED))
    writer.addDocument(doc)
writer.commit()
writer.close()
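A hypothetical follow-up query against the node index built above (query text invented):

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_CURRENT, "text", analyzer).parse(u"weibo")
    for hit in searcher.search(query, 20).scoreDocs:
        print searcher.doc(hit.doc).get("id")
    reader.close()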