Example No. 1
def build_index():
    # data_dir is assumed to point at a CSV file of (english, chinese) rows
    f = open(data_dir)
    reader = csv.reader(f)

    print("开始创建索引")

    indx = 0

    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)

    for line in reader:
        eng, zh = line[0], line[1]

        doc = lucene.Document()

        doc.add(
            lucene.Field('eng', eng, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
        doc.add(
            lucene.Field('zh', zh, lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))

        writer.addDocument(doc)

        if indx % 100000 == 0:
            print("%sK" % (indx / 1000))

        indx += 1

    print("写引擎优化")
    writer.optimize()
    writer.close()
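
A minimal follow-up sketch (not part of the original example; the module-level directory and analyzer names are assumptions carried over from the code above) showing how the index built by build_index() could be searched on the 'eng' field:

def search_eng(query_text):
    searcher = lucene.IndexSearcher(directory, True)
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, 'eng',
                               analyzer).parse(query_text)
    # print the stored English/Chinese pair for the top 10 hits
    for score_doc in searcher.search(query, 10).scoreDocs:
        doc = searcher.doc(score_doc.doc)
        print("%s -> %s" % (doc.get('eng'), doc.get('zh')))
    searcher.close()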
Example No. 2
 def testAdd(self, filepath):
     writer = lucene.IndexWriter(self.dir, self.getAnalyzer(), False,
                                 lucene.IndexWriter.MaxFieldLength.UNLIMITED)
     #True = create a new index; False = open the existing index for incremental updates
     file = open(filepath)
     contents = unicode(file.read(), 'gbk')
     file.close()
     doc = lucene.Document()
     doc.add(lucene.Field("name", os.path.basename(filepath),
                          lucene.Field.Store.YES,
                          lucene.Field.Index.NOT_ANALYZED))
     doc.add(lucene.Field("path", filepath,
                          lucene.Field.Store.YES,
                          lucene.Field.Index.NOT_ANALYZED))
     if len(contents) > 0:
         title = self.getTxtAttribute(contents, 'Title')
         author = self.getTxtAttribute(contents, 'Author')
         language = self.getTxtAttribute(contents, 'Language')
         doc.add(lucene.Field("Title", title,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("Author", author,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("Language", language,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("contents", contents,
                              lucene.Field.Store.NO,
                              lucene.Field.Index.ANALYZED))
     else:
         print "warning: no content in %s" % filename
     writer.addDocument(doc)
     writer.optimize()
     writer.close()
Example No. 3
    def UpdateIndex(self):
        # Update the index with the latest content
        writer = lucene.IndexWriter(self.indexDir, self.analyzer, True,
                                    lucene.IndexWriter.MaxFieldLength(512))

        try:
            # Fetch the rows from the DB
            for row in self.rows:
                doc = lucene.Document()

                doc.add(
                    lucene.Field("bookUrl", row[0], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("thumbUrl", row[1], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("price", row[2], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("title", row[3], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                doc.add(
                    lucene.Field("subTitle", row[4], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("author", row[5], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                doc.add(
                    lucene.Field("publisher", row[6], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                doc.add(
                    lucene.Field("publishDate", row[7], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("offcode", row[8], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                date = str(row[9]).replace('-', '')
                print 'regDate : ' + date + ' ' + str(type(date))
                doc.add(
                    lucene.Field("regDate", date, lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

                date = str(row[10]).replace('-', '')
                print 'updateDate : ' + date
                doc.add(
                    lucene.Field("updateDate", date, lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

                writer.addDocument(doc)
        except Exception, e:
            print "Failed in adding index : %s" % e
            exit(1)

        writer.optimize()
        writer.close()
Example No. 4
 def __init__(self, network):
     self.network = network
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
     analyzers = {"smartcn": smartcn}
     self.pgconn = mypass.getConn()
     writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                             analyzers["smartcn"])
     writerconfig.setWriteLockTimeout(600000L)
     writerconfig.setMaxThreadStates(50)
     writerconfig.setRAMBufferSizeMB(128.0)
     self.storeDir = self.storeDirBase + self.network
     store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
     self.writer = lucene.IndexWriter(store, writerconfig)
Example No. 5
	def __init__(self, root, storeDir, analyzer):
		if not os.path.exists(storeDir):
			os.mkdir(storeDir)
		store = lucene.SimpleFSDirectory(lucene.File(storeDir))
		writer = lucene.IndexWriter(store, analyzer, True, lucene.IndexWriter.MaxFieldLength.LIMITED)
		writer.setMaxFieldLength(1048576)
		self.indexDocs(root, writer)
		ticker = Ticker()
		print 'optimizing index',
		threading.Thread(target=ticker.run).start()
		writer.optimize()
		writer.close()
		ticker.tick = False
		print 'done'
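
The Ticker class referenced above is not defined in this snippet; a sketch consistent with the stock PyLucene IndexFiles sample (sys and time imports assumed) is:

class Ticker(object):
	# Prints a dot every second as progress feedback until tick is set False.
	def __init__(self):
		self.tick = True
	def run(self):
		while self.tick:
			sys.stdout.write('.')
			sys.stdout.flush()
			time.sleep(1.0)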
Example No. 6
    def __init__(self):
        smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
        #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
        analyzers = {"smartcn": smartcn}
        self.pgconn = mypass.getConn()
        self.sw = sinaweibooauth.SinaWeiboOauth()
        if not os.path.exists(self.storeDir):
            os.mkdir(self.storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
        writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"])
        writerconfig.setWriteLockTimeout(600000L)
        writerconfig.setMaxThreadStates(50)
        writerconfig.setRAMBufferSizeMB(128.0)
        self.writer = lucene.IndexWriter(store, writerconfig)
Example No. 7
 def __init__(self, index_dir):
     '''
     Initialises index parameters
     '''
     lucene.initVM()
     self.index_dir = index_dir
     if not os.path.exists(self.index_dir):
         os.mkdir(self.index_dir)
     store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
     self.analyser = PorterStemmerAnalyzer()
     self.writer = lucene.IndexWriter(
         store, self.analyser, True,
         lucene.IndexWriter.MaxFieldLength.LIMITED)
     self.writer.setMaxFieldLength(1048576)
     directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
     self.reader = lucene.FilterIndexReader.open(directory, True)
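
PorterStemmerAnalyzer is not defined in this snippet; a plausible sketch (an assumption, following the common PyLucene custom-analyzer pattern for Lucene 3.x) is:

class PorterStemmerAnalyzer(lucene.PythonAnalyzer):
    # Tokenize, normalize, lowercase, then Porter-stem each field's text.
    def tokenStream(self, fieldName, reader):
        result = lucene.StandardTokenizer(lucene.Version.LUCENE_CURRENT, reader)
        result = lucene.StandardFilter(result)
        result = lucene.LowerCaseFilter(result)
        result = lucene.PorterStemFilter(result)
        return result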
Example No. 8
def index_files(board, time_delta):
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    #  writer.setMaxFieldLength(1048576) # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue

        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()

        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(
                lucene.Field("name", filename, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("owner", owner, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("title", title, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("contents", contents, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
Example No. 9
def get_word_list(text, is_list=False, field_name='fieldname'):
    if is_list:
        text = "".join(line + "\n" for line in text)

    lucene.initVM(lucene.CLASSPATH)
    analyzer = lucene.KoreanAnalyzer()

    #directory = lucene.FSDirectory.open("/tmp/testindex");
    directory = lucene.RAMDirectory()

    # writer
    writer = lucene.IndexWriter(directory, analyzer)
    doc = lucene.Document()

    doc.add(lucene.Field(field_name, text, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    # get all terms from all index
    ireader = lucene.IndexReader.open(directory, False)
    term = lucene.Term(field_name, '')
    termenum = ireader.terms(term)
    term = termenum.term()
    i = 0

    word_list = []

    while term and term.field() == field_name:
        i += 1
        termDocs = ireader.termDocs(term)
        termDocs.next()
        #print "[%04d]===> <%s> " % (i, term.text())
        #print term.text() + " : " + str(termDocs.freq())
        word_list.append({'text': term.text(), 'freq': termDocs.freq()})
        term = termenum.next() and termenum.term()

    ireader.close()
    directory.close()

    return word_list
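
A hypothetical call to the function above (the input lines are placeholders), printing each distinct term together with its frequency:

for entry in get_word_list([u"first line", u"second line"], is_list=True):
    print entry['text'], entry['freq']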
Example No. 10
    def __init__(self, root, storeDir, analyzer, startDate, endDate):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(storeDir))
        # Create the IndexWriter. The first argument is the Directory, the second the analyzer;
        # the third says whether to create a new index (False means modify the existing one);
        # the fourth caps the field length, e.g. MaxFieldLength(2) keeps only two terms per field;
        # IndexWriter.MaxFieldLength.LIMITED is the usual choice.
        writer = lucene.IndexWriter(store, analyzer, False,
                                    lucene.IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer, startDate, endDate)
        ticker = Ticker()
        print 'optimizing index',
        threading.Thread(target=ticker.run).start()
        writer.optimize()
        writer.close()
        ticker.tick = False
        print 'done'
Example No. 11
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    #fix this later.....FieldType not defined
    #field_type=lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    file1 = open("nitin.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    file1 = open("nitin2.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
Example No. 12
def IndexCreate(fileDir, indexDir):
	analyzer = lucene.StandardAnalyzer()	# create the objects Lucene uses
	store = lucene.FSDirectory.getDirectory(indexDir)
	writer = lucene.IndexWriter(store, analyzer)

	for root, dirnames, filenames in os.walk(fileDir):	# search the given folder for text files only
		for filename in filenames:
			if not filename.endswith('.txt'):
				continue
			
			print("Adding: %s" % filename)
			try:
				path = os.path.join(root, filename)
				f = open(path)
				content = f.read()
				f.close()

				content = content.decode('cp949').encode('utf-8')	# convert the encoding to 'utf-8'

				doc = lucene.Document()				# create a Document object
				doc.add(lucene.Field(	"name", 	# file name
										filename,
										lucene.Field.Store.YES,
										lucene.Field.Index.NO))
				doc.add(lucene.Field(	"path", 	# file path
										path,
										lucene.Field.Store.YES,
										lucene.Field.Index.NO))
				if len(content) > 0:
					doc.add(lucene.Field(	"content", 		# file contents
											content,
											lucene.Field.Store.NO,
											lucene.Field.Index.TOKENIZED))
				else:
					print("Warning: No contents in %s" % filename)
				writer.addDocument(doc)				# add the Document to the index
			except Exception, e:
				print("Failed in adding index: %s" % e)

	writer.optimize()
	writer.close()
Example No. 13
	def UpdateIndex(self):
		"Update the index with the latest content"
		self.lastIndexingTime = self.__ReadLatestUpdateTime()	# time of the last indexing run (None = never indexed)
		writer = lucene.IndexWriter(self.store, self.analyzer, lucene.IndexWriter.MaxFieldLength(1048576))

		for root, dirnames, filenames in os.walk(self.blogDir):
			for filename in filenames:
				if not filename.endswith('.txt'):	# index only .txt files
					continue

				path = os.path.join(root, filename)
				if (self.lastIndexingTime != None and self.lastIndexingTime >= int(os.stat(path).st_mtime)):
					continue		# this file is already in the index

				print("Adding: %s" % filename)
				try:
					f = open(path)
					content = f.read()
					f.close()

					doc = lucene.Document()
					doc.add(lucene.Field(	"bloger", 
											path.rsplit("\\", 2)[1],		# the directory containing the file identifies the blogger
											lucene.Field.Store.YES,
											lucene.Field.Index.UN_TOKENIZED))
					doc.add(lucene.Field(	"path", 
											path,
											lucene.Field.Store.YES,
											lucene.Field.Index.UN_TOKENIZED))
					doc.add(lucene.Field(	"contents", 
											content,
											lucene.Field.Store.NO,
											lucene.Field.Index.TOKENIZED))
					writer.addDocument(doc)
				except Exception, e:
					print("Failed in adding index: %s" % e)

		writer.close()
Example No. 14
    def handle_noargs(self, **options):
        siteconfig = SiteConfiguration.objects.get_current()

        # Refuse to do anything if they haven't turned on search.
        if not siteconfig.get("search_enable"):
            sys.stderr.write('Search is currently disabled. It must be '
                             'enabled in the Review Board administration '
                             'settings to run this command.\n')
            sys.exit(1)

        if not have_lucene:
            sys.stderr.write('PyLucene is required to build the search index.\n')
            sys.exit(1)

        incremental = options.get('incremental', True)

        store_dir = siteconfig.get("search_index_file")
        if not os.path.exists(store_dir):
            os.mkdir(store_dir)
        timestamp_file = os.path.join(store_dir, 'timestamp')

        timestamp = 0
        if incremental:
            try:
                f = open(timestamp_file, 'r')
                timestamp = datetime.utcfromtimestamp(int(f.read()))
                f.close()
            except IOError:
                incremental = False

        f = open(timestamp_file, 'w')
        f.write('%d' % time.time())
        f.close()

        if lucene_is_2x:
            store = lucene.FSDirectory.getDirectory(store_dir, False)
            writer = lucene.IndexWriter(store, False,
                                        lucene.StandardAnalyzer(),
                                        not incremental)
        elif lucene_is_3x:
            store = lucene.FSDirectory.open(lucene.File(store_dir))
            writer = lucene.IndexWriter(store,
                lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
                not incremental,
                lucene.IndexWriter.MaxFieldLength.LIMITED)
        else:
            assert False

        status = Q(status='P') | Q(status='S')
        objects = ReviewRequest.objects.filter(status)
        if incremental:
            query = Q(last_updated__gt=timestamp)
            # FIXME: re-index based on reviews once reviews are indexed.  I
            # tried ORing this in, but it doesn't seem to work.
            #        Q(review__timestamp__gt=timestamp)
            objects = objects.filter(query)

        if sys.stdout.isatty():
            print 'Creating Review Request Index'
        totalobjs = objects.count()
        i = 0
        prev_pct = -1

        for request in objects:
            try:
                # Remove the old documents from the index
                if incremental:
                    writer.deleteDocuments(lucene.Term('id', str(request.id)))

                self.index_review_request(writer, request)

                if sys.stdout.isatty():
                    i += 1
                    pct = (i * 100 / totalobjs)
                    if pct != prev_pct:
                        sys.stdout.write("  [%s%%]\r" % pct)
                        sys.stdout.flush()
                        prev_pct = pct

            except Exception, e:
                sys.stderr.write('Error indexing ReviewRequest #%d: %s\n' % \
                                 (request.id, e))

        writer.optimize()
        writer.close()
Example No. 15
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )

    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # Frequently the definition text contains URLs or other hyperlinks that could
        # produce query hits we do not want, errantly increasing the score of the field.
        # We strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all represented as
        # lists in our Ontology object and need to be entered one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
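
add_field_to_document and add_fields_to_document are not defined in this snippet; sketches consistent with the calls above (the optional boost handling is an assumption, using the Lucene 3.x setBoost API) might look like:

def add_field_to_document(doc, name, value, store, index, boost=None):
    # Add a single field, optionally boosting its score contribution.
    field = lucene.Field(name, value or "", store, index)
    if boost is not None:
        field.setBoost(boost)
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    # Add one field instance per value so each is indexed separately.
    for value in values:
        add_field_to_document(doc, name, value, store, index, boost)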
Example No. 16
""" Creates a sample Lucene index for the full-text search feature. """

import lucene
import sys

if __name__ == "__main__":
    lucene.initVM()
    indexDir = "D:/Downloads/index"
    dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = lucene.IndexWriter(dir_, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." %
          writer.numDocs())

    content = (
        "Strategische Konzeption, Umsetzung und Betreuung von langfristig " +
        "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.")
    doc = lucene.Document()
    doc.add(
        lucene.Field("content", content, lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
    doc.add(
        lucene.Field("filePath", "Projekte/bericht.txt",
                     lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
Example No. 17
    def addMessage(self, username, xprotocol, xfriend_chat, who_sent,
                   timestamp, text):
        #Clean up protocol and friend_chat fields
        """ For some unknown reason, PyLucene (and probably Lucene as well)
            seems to have problems searching for things like SoAndSo but
            has no problems searching for soandso. To prevent headaches in
            the future we simply set it all to lowercase since the case
            does not matter for these fields."""
        protocol = xprotocol.lower()
        friend_chat = xfriend_chat.lower()

        #Determine index and data paths
        index_dir = self.indexdir + username
        data_dir = self.datadir + username + PATH_SEP + protocol + PATH_SEP
        data_file = data_dir + friend_chat

        #if the index doesn't exist, we use a special constructor to create it
        if os.path.isdir(index_dir) == False:
            os.makedirs(index_dir)
            luc_index = lucene.FSDirectory.getDirectory(index_dir, True)
            luc_writer = lucene.IndexWriter(luc_index,
                                            lucene.StandardAnalyzer(), True)
        else:
            luc_index = lucene.FSDirectory.getDirectory(index_dir)
            luc_writer = lucene.IndexWriter(luc_index,
                                            lucene.StandardAnalyzer())
        #Opening the index before writing to the file gives us a lock
        #on the index. As long as writing to data files occurs only
        #through this function, this is guaranteed to be an atomic
        #operation. Closing the writer releases the lock.

        if os.path.isdir(data_dir) == False:
            os.makedirs(data_dir)
        #filesize is used to determine the file offset
        if os.path.isfile(data_file) == False:
            filesize = 0
        else:
            filesize = os.path.getsize(data_file)

        datahandle = open(data_file, 'a')
        datahandle.write(str(who_sent))
        datahandle.write("\n")
        datahandle.write(str(timestamp))
        datahandle.write("\n")
        datahandle.write(str(len(str(text))))  #what a mess
        datahandle.write("\n")
        datahandle.write(str(text))
        datahandle.write("\n")

        doc = lucene.Document()
        doc.add(self.__makeKeywordField('protocol', str(protocol)))
        doc.add(self.__makeKeywordField('friend_chat', str(friend_chat)))
        clean_timestamp = self.__padTimestamp(timestamp)
        doc.add(self.__makeKeywordField('timestamp', clean_timestamp))
        doc.add(self.__makeKeywordField('who_sent', str(who_sent)))
        doc.add(self.__makeUnIndexedField('file_offset', str(filesize)))
        clean_text = re.sub("<[^>]*>", " ", str(text))
        doc.add(self.__makeUnStoredField('text', clean_text))

        luc_writer.addDocument(doc)
        luc_writer.close()
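
The __make*Field helpers are not shown in this snippet; sketches matching the classic Lucene field conventions their names suggest (an assumption) are:

    def __makeKeywordField(self, name, value):
        # Keyword: stored and searchable verbatim, not analyzed.
        return lucene.Field(name, value, lucene.Field.Store.YES,
                            lucene.Field.Index.NOT_ANALYZED)

    def __makeUnIndexedField(self, name, value):
        # UnIndexed: stored for retrieval only, never searched.
        return lucene.Field(name, value, lucene.Field.Store.YES,
                            lucene.Field.Index.NO)

    def __makeUnStoredField(self, name, value):
        # UnStored: analyzed and searchable, but not stored.
        return lucene.Field(name, value, lucene.Field.Store.NO,
                            lucene.Field.Index.ANALYZED)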
Example No. 18
import os, re, sys, lucene
from BeautifulSoup import BeautifulSoup
import lxml.html

SYMBOL = '[-_\s\n,.<>/?:;\"\'\[\]{ }\\\|`~!@#$%^&*()=\+]+'
pattern = re.compile(SYMBOL)

# Pretreatment
INDEX_DIR = 'index'
lucene.initVM()
directory = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

writer = lucene.IndexWriter(directory, analyzer, True,
                            lucene.IndexWriter.MaxFieldLength.UNLIMITED)


def visible(element):
    if element.parent.name in [
            'style', 'script', '[document]', 'head', 'title'
    ]:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    return True


def get_page_num(file):
    tmp = file.split('/')
    return tmp[1] + '/' + tmp[2]
Example No. 19
 def begin_indexing(self, session, index):
     # will append if exists, or create if not
     if not self.writer:
         self.writer = lucene.IndexWriter(
             self.dir, self.analyzer,
             lucene.IndexWriter.MaxFieldLength.UNLIMITED)
Example No. 20
if __name__ == '__main__':
    INDEX_DIR = "/home/andrew/lucene_index"

    # Initialize lucene and JVM
    lucene.initVM()

    print("lucene version is:", lucene.VERSION)

    # Get the analyzer
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Get index storage
    store = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))

    # Get index writer
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)

    try:
        # create a document that will be added to the index
        doc = lucene.Document()

        # Add a field to this document (the value string below is a placeholder;
        # the original snippet omitted the required value argument)
        field = lucene.Field("titlendia", "placeholder title text",
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED)

        # Add this field to the document
        doc.add(field)

        # Add the document to the index
        writer.addDocument(doc)