Example #1
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
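
A minimal call sketch for the indexer above, assuming a tab-separated dump file (title, tab, text per line) and the module-level stopwords iterable the function relies on; the path, file name, and words below are illustrative.

stopwords = ['a', 'an', 'and', 'of', 'the']  # assumed module-level list used by wikipedia_indexer
wikipedia_indexer('/tmp/wiki_index', 'data/wiki_articles.tsv')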
Example #2
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #3
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
Example #4
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
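
A hedged usage sketch for LuceneIndexer, assuming IntField and TextField are imported as in the snippet and that input arrives on stdin as the close() message suggests; the header layout is illustrative.

indexer = LuceneIndexer('/tmp/lucene_index')
header = [('id', IntField), ('body', TextField)]  # (field name, Lucene field class) pairs expected by add_document()
for i, line in enumerate(sys.stdin):
    fields = [str(i), line.strip()]               # one value per header entry
    indexer.add_document(fields, header, i)
indexer.close()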
Example #5
	def __init__(self,root,storeDir,analyzer):
		# Create the index dir if it does not exist 
		if not os.path.exists(storeDir):
			os.mkdir(storeDir)
		# the SimpleFSDirectory which the index will be written in
		store = SimpleFSDirectory(File(storeDir))
		analyzer = LimitTokenCountAnalyzer(analyzer,1048576)
		config = IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		# create an index writer
		# attach the index dir and config info to it
		writer = IndexWriter(store,config)

		# call the indexing procedure
		# indexing all the files in the directory specified by root
		# write the index with writer
		self.indexDocs(root,writer)
		# start a ticker
		ticker = Ticker()
		print 'commit index'
		threading.Thread(target=ticker.run).start()
		writer.commit()
		writer.close()
		# stop the ticker when the indexing procedure completes
		ticker.tick = False
		print 'Done'
Example #6
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #7
 def reindex(self):
     writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)), self.corpus.analyzer, False, IndexWriter.MaxFieldLength.LIMITED)
     indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
     writer.optimize()
     writer.close()
     self.parent.write({'message': "Reindex successful. Corpus analyzer is now set to %s." % (self.corpus.analyzer_str,)})
     self.parent.write({'status': "Ready!"})
Example #8
    def __init__(self, root, store_dir):

        if not os.path.exists(store_dir):
            os.mkdir(store_dir, 0777)


        # NOTE: Hardcoded the analyzer instead of passing it
        lucene.initVM()
        '''
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        '''
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        store = SimpleFSDirectory(File(store_dir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

        # Set the permissions to 777 for the index directory and the write.lock file
        chmod_indexdir_cmd = "chmod 0777 " + store_dir
        writelock_file = store_dir + "/" + "write.lock"
        chmod_writelock_cmd = "chmod 0777 " + writelock_file

        if os.path.exists(store_dir):
            cicmd=os.popen("sudo -S %s"%(chmod_indexdir_cmd), 'w').write('vagrant')

        if os.path.exists(writelock_file):
            cwcmd=os.popen("sudo -S %s"%(chmod_writelock_cmd), 'w').write('vagrant')

        # setting CREATE will overwrite the existing index.
        ###config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.close()
Example #9
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
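
A hedged call sketch for create_index(); note that each path is concatenated directly with the file name inside the loop above, so directory paths should end with a separator. The paths here are illustrative.

create_index('/tmp/sentence_index', ['data/books/', 'data/essays/'])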
Example #10
    def removeindex(self, data):
        writer = IndexWriter(
            self.d, self.conf)

        writer.deleteDocuments(lucene.Term("_id", data['record']['_id']))

        writer.optimize()
        writer.close()
Example #11
 def deleteRec(self, pid):
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     writer.deleteDocuments(Term('uid', pid))
     writer.commit()
     writer.close()
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Example #12
    def updateindex(self, data):
        writer = IndexWriter(
            self.d, self.conf)

        doc = self.buildDocument(data['fields'], data['record'])
        writer.updateDocument(lucene.Term("_id", data['record']['_id']), doc)

        writer.optimize()
        writer.close()
Example #13
    def index(self, data):
        writer = IndexWriter(
            self.d, self.conf)

        doc = self.buildDocument(data['fields'], data['record'])
        writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #14
    def rebuildIndex(self, data):
        writer = IndexWriter(
            self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))

        for record in data['records']:
            doc = self.buildDocument(data['fields'], record)
            writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #15
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #16
class Indexer(object):
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
        :param analyzer: The analyzer to use (overrides the default StandardAnalyzer)
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
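
A hedged usage sketch for this Indexer, assuming lucene.initVM() has already been called and that Document and Field are imported from org.apache.lucene.document; the directory, field names, and values are illustrative.

indexer = Indexer(root="/tmp/demo_index")
doc = Document()
doc.add(Field("id", "doc-001", indexer.field_clean))                 # stored, not tokenized
doc.add(Field("body", "some searchable text", indexer.field_dirty))  # tokenized, not stored
indexer.index(doc)
indexer.shutdown()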
Example #17
def make_index(indexed_data, index_destination, source='directory'):
    #index wiki articles based on ck 12 topics
    #analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_destination)), indexWriterConfig)
    if source == 'directory':
        indexDirectory(indexed_data, writer)
    else:
        indexDictionary(indexed_data, writer)
    writer.close()
Example #18
    def __init__(self, root, storeDir, analyzer):
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.commit()
        writer.close()
Example #19
class WikiPageIndex():
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()


    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)

        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub(r'(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub(r'[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub(r'([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
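
A hedged usage sketch for WikiPageIndex; the index path, page text, and query are illustrative, and the lucene.initVM() call commented out in __init__ is assumed to have happened elsewhere. Because __init__ also opens an IndexSearcher, searching expects an index to already exist in the directory.

wiki_index = WikiPageIndex("/tmp/wiki_page_index")
wiki_index.createIndex()
wiki_index.addDocumentToIndex("Lucene", "Apache Lucene is a search library written in Java ...")
wiki_index.closeIndex()

results = wiki_index.searchIndex("search library", field="Text", max_results=10)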
Example #20
 def import_csv_with_content(self, csv_file, content_field):
     try:
         writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)), self.analyzer, False, IndexWriter.MaxFieldLength.LIMITED)
         changed_rows = addmetadata.add_metadata_and_content_from_csv(self.searcher, self.reader, writer, csv_file, content_field, self.args_dir)
         writer.close()
     except UnicodeDecodeError:
         try:
             writer.close()
         except:
             pass
         self.parent.write({'error': 'CSV import failed: file contained non-unicode characters. Please save the file with UTF-8 encoding and try again!'})
         return
     self.parent.write({'message': "CSV import complete: %s rows added." % (changed_rows,)})
Example #21
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
Example #22
    def __init__(self, indexPath):
        """Instantiate the handler object."""
        self.indexPath = indexPath
        self.analyzer = StopAnalyzer()
        
        # Make sure the path exists
        if not os.path.exists(self.indexPath):
            os.mkdir(self.indexPath)

        if not os.path.exists(os.path.join(self.indexPath, 'segments.gen')):
            log('Creating new index.')
            writer = IndexWriter(self.indexPath, self.analyzer, 1)
            writer.close()
Example #23
def lucene_index(texts):
    """

    :param corpus_file_path:
    :param f_type:
    :return:
    """
    index = set_lucene_index['ind']  # nonlocal variable index
    config = IndexWriterConfig(version, analyzer)
    writer = IndexWriter(index, config)

    for t in texts:
        addDoc(writer, t)
    writer.close()
Example #24
 def buildIndex(self, inputFile):
     analyzer = self.getAnalyzer()
     iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
     
     iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter( SimpleFSDirectory( File(self.luceneDir) ), iwconf)
     
     # read through input file and write out to lucene
     counter = 0
     linesReadCounter = 0
     
     with open(inputFile, 'r') as lines:
         linesRead = 0
         
         for line in lines:
             try:
                 linesRead+=1
                 
                 if linesRead % 1000 == 0:
                     print "%d lines read" % linesRead
                     
                 cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t")
                 concept = concept.strip()
                 cui = cui.strip()
                 
                 strNorm = self.normalizeCasePunct(concept)
                 strSorted = self.sortWords(strNorm)
                 strStemmed = self.stemWords(strNorm)
                 strStemmedSorted = self.stemWords(strSorted)
       
                 fdoc = Document()
                 
                 counter +=1
                 fid = counter
                 
                 fdoc.add( Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 writer.addDocument(fdoc)
                 if fid % 1000 == 0:
                     writer.commit()
             except:
                 "Skipping line: %s" % line
                 
     writer.commit()
     writer.close()
Example #25
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all([isinstance(d, Document) for d in documents]):
        raise TypeError("documents should be iterable of type Document! Given: %s" % type(documents[0]))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)), writer_config)
    for doc in documents:
        writer.addDocument(doc)
    writer.close()
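
A hedged usage sketch for index(); it reuses the Lucene 3.x-style Field API of the surrounding examples and assumes Document, Field, StandardAnalyzer, and Version are already imported. The path and texts are illustrative.

docs = []
for i, text in enumerate(["first sample document", "second sample document"]):
    d = Document()
    d.add(Field("id", str(i), Field.Store.YES, Field.Index.NOT_ANALYZED))
    d.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
    docs.append(d)

index(StandardAnalyzer(Version.LUCENE_30), "/tmp/demo_index", docs)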
Example #26
 def __init__(self, root, storeDir, analyzer): 
     if not os.path.exists(storeDir): 
         os.mkdir(storeDir) 
     store = SimpleFSDirectory(File(storeDir)) 
     analyzer = LimitTokenCountAnalyzer(analyzer, 1000)#1048576 
     config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) 
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) 
     writer = IndexWriter(store, config) 
     self.indexDocs(root, writer) 
     ticker = Ticker() 
     print 'commit index', 
     threading.Thread(target=ticker.run).start() 
     writer.commit() 
     writer.close() 
     ticker.tick = False 
     print 'done'
Example #27
def rollback(collection_name):
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)

	writer.rollback()
	writer.close()
Example #28
def delete(primary_keys_map,collection_name,todelete,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	try:
		tofind_keyvalue_pairs=json.loads(todelete)
	except:
		return 100	
	

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	try:
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
		ireader=IndexReader.open(direc)
	except:
		return 105

	# as of now, deletion of documents is supported only on the basis of indexed keys
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents according to primary keys		
	query=BooleanQuery()
	for key in tofind_primary_keyvalue_pairs.keys():
		temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
		query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

	a=writer.deleteDocuments(query)
	if commit==True:
		writer.commit()
	writer.close()
	return 0
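
A hedged call sketch for delete(); the key name, collection name, and JSON payload are illustrative, and the return value is one of the numeric status codes used above (0 on success).

status = delete(primary_keys_map=['doc_id'],
                collection_name='my_collection',
                todelete=json.dumps({'doc_id': '42'}),
                commit=True)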
Example #29
    def xmlrpc_indexDocument(self, instance, id, text):
        """Index a new document."""
        self.xmlrpc_unindexDocument(instance, id)

        # Create a document and add its fields.
        doc = Document()
        doc.add(Field('id', id, Field.Store.YES, Field.Index.UN_TOKENIZED))
        doc.add(Field('text', text, Field.Store.YES, Field.Index.TOKENIZED))
        doc.add(Field('instance', instance, Field.Store.YES, Field.Index.UN_TOKENIZED))

        # Write the document into the index.
        writer = IndexWriter(self.indexPath, self.analyzer, 0)
        writer.addDocument(doc)
        writer.optimize()
        writer.close()
        log('Insert: Instance: %s Document: %s' %(instance, id))
        return 1
Example #30
    def __init__(self, destination_directory, analyzer):

        if not os.path.exists(destination_directory):
            os.mkdir(destination_directory)

        store = SimpleFSDirectory(File(destination_directory))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.tweetIndexer(writer)
        ticker = Ticker()
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #31
File: IndexFiles.py Project: elfdown/ee208
    def __init__(self, root, storeDir, relationFile):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        # store = SimpleFSDirectory(File(storeDir).toPath())
        store = SimpleFSDirectory(Paths.get(storeDir))
        # analyzer = StandardAnalyzer()
        analyzer = WhitespaceAnalyzer()
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.read_relation(relationFile)
        self.indexDocs(root, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Example #32
File: app.py Project: kcobos/boesearcher
def indexar():
    directory = SimpleFSDirectory(Paths.get("./lucene/index"))
    analyzer = SpanishAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    doc_names = os.listdir("./documentos")
    indexados = 0
    for dn in doc_names:
        d = open("./documentos/" + dn, "r")
        bs = BeautifulSoup(d, "lxml")
        d.close()
        doc = Document()
        doc.add(
            Field("id", bs.documento.metadatos.identificador.text,
                  StringField.TYPE_STORED))
        doc.add(
            Field("titulo", bs.documento.metadatos.titulo.text,
                  StringField.TYPE_STORED))
        doc.add(
            Field("pdf", bs.documento.metadatos.url_pdf.text,
                  StringField.TYPE_STORED))
        doc.add(Field("texto", bs.documento.texto.text, TextField.TYPE_STORED))
        writer.addDocument(doc)
        indexados += 1
    writer.commit()
    writer.close()

    return render_template("indexados.html",
                           lucene=lucene.VERSION,
                           indexados=indexados)


# @app.route("/slides")
# def slides():
#     return render_template("slides/index.html")
Example #33
File: TwIndexer.py Project: skopp002/cs242
def indexing(datadir):
    indexedDocs = 0
    #index_outdir = str(input("Enter index output dir: "))
    path = Paths.get('indexOut')
    indexOut = SimpleFSDirectory(path)
    analyzer = EnglishAnalyzer()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexOut, config)
    for filename in glob.iglob(datadir + '/*.json*', recursive=True):
        try:
            print("Filename is", filename)
            #pdb.set_trace()
            with open(filename) as f:
                for line in f:
                    tweet=json.loads(line)
                    if(tweet['lang']=='en'):
                        doc = Document()  # create a fresh Document per tweet so fields do not accumulate
                        doc.add(StringField("id", tweet['id_str'], Field.Store.YES))
                    # doc.add(Field("screen_name", tweet['user.screen_name']))
                    # print(tweet['user.screen_name'])
                    # doc.add(Field("name", tweet['user.name']))
                    #doc.add(Field("location", tweet['user.location']))
                    #print(tweet['user.location'])
                        doc.add(TextField("text",tweet['text'],Field.Store.YES))
                    #doc.add(Field("created_at", DateTools.stringToDate(tweet['created_at']),Field.Store.YES))
                        doc.add(TextField("created_at", tweet['created_at'], Field.Store.YES))
                    # doc.add(IntPoint("followers", tweet['user.followers_count'],Field.Store.YES))
                    # doc.add(IntPoint("friends", tweet['friends_count'],Field.Store.YES))
                        writer.addDocument(doc)
                        writer.commit()
                        indexedDocs+=1
        except:
            continue


    writer.close()
    print("Indexed ", indexedDocs, " documents")
Example #34
def main(src, dst):
    try:
        start_time = time.time()

        print "Indexing starts..."
        indicesDestination = File(dst)
        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        # Analyzer: text such as a body or title must be run through an analyzer and split into terms before it is indexed. The Analyzer class is passed, together with the Directory, to the IndexWriter constructor; it splits the given text into the unit terms to be indexed and removes unnecessary words.

        analyzer = KeywordAnalyzer()  # treats the entire text as a single token (effectively the same as not analyzing it at all)
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # map (a Python dict) used to build the PerFieldAnalyzerWrapper
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)  # see http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        # SimpleFSDirectory stores the index files in a specific directory on the file system (DB, RAM, and file-system backed directories are available)
        # config is the IndexWriterConfig (the analyzer settings) required by the IndexWriter

        counter = Counter()
        generate_indices_from_projects(src, writer, counter)
        writer.close()
        print "Done"
        print str(counter)
        print "$$$%s\tseconds" % (time.time() - start_time)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
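
A hedged sketch of the kind of document generate_indices_from_projects() would add so that the per-field analyzers above take effect: the "code" field is analyzed by JavaCodeAnalyzer and the "comments" field by EnglishAnalyzer. The field values are illustrative.

doc = Document()
doc.add(Field("code", "public static void main(String[] args) {}", Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("comments", "program entry point", Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)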
Example #35
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    LUCENE_INDEX_DIR = 'mmapDirectory\\trec_v21_para_uri'
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(512.0)  # experimental setting !!
    # write data to index

    #if not is_index_Exist:
    if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            cmd = 'robocopy %s %s\code_files *.py' % (r'%cd%',
                                                      LUCENE_INDEX_DIR)
            os.system(cmd)
        else:
            cmd = 'mkdir %s/code_files' % (LUCENE_INDEX_DIR)
            os.system(cmd)
            cmd = 'cp -f *.py %s/code_files' % (LUCENE_INDEX_DIR)
            os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Example #36
    def index(cls, indexDir, taxoDir, facets_config):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        config = IndexWriterConfig(WhitespaceAnalyzer())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir,
                                       IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # obtain the sample facets for current document
            facets = categories[docNum]
            author = authors[docNum]
            # ... and use the FacetField class for adding facet fields to
            # the Lucene document (and via FacetsConfig to the taxonomy index)
            doc.add(FacetField("Author", author))
            for f in facets:
                doc.add(FacetField("Categories", f))
            # finally add the document to the index
            iw.addDocument(facets_config.build(taxo, doc))
            nDocsAdded += 1

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        iw.close()
        taxo.close()
        print "Indexed %d documents with facets." % nDocsAdded
Example #37
    def __init__(self, root, storeDir):

        self.root=root
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.store=store
        self.Analyzer=analyzer
        self.success=0

        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Example #38
File: syntax.py Project: zoudajia/rencos
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)

    # t1 = FieldType()
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    #
    # t2 = FieldType()
    # t2.setStored(True)
    # t2.setTokenized(True)
    # t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return
    with open(file_dir + "/train/train.ast.src") as fc:

        codes = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in fc.readlines()
        ]

    for k, code in enumerate(codes):
        doc = Document()
        doc.add(StoredField("id", str(k)))
        doc.add(TextField("code", code, Field.Store.YES))

        writer.addDocument(doc)

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
Example #39
class Indexer(object):
    """Usage: python IndexFiles <doc_directory>"""
    def __init__(self, index_dir):
        print("lucene:", lucene.VERSION)
        self.index_dir = index_dir
        store = SimpleFSDirectory(Paths.get(self.index_dir))
        analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(store, config)

    def build_index(self, dict_data):
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print("done")
Example #40
def main():
	try:
		print "Indexing starts..."
		indicesDestination = File("/Users/Falcon/Desktop/New_Indices/IJA_Indices")

		analyzer = KeywordAnalyzer()  
		a = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) 				
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_projects(writer, counter)
		writer.close()

		print "Done"
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
Example #41
def index():
    indexFile = File(luceneDirectory).toPath()
    directory = FSDirectory.open(indexFile)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 128479)
    writeConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writeConfig)

    file_number = 2
    while file_number <= 200:
        data = []
        file_name = './parsed/parsed_data' + str(file_number) + '.txt'
        with open(file_name) as f:
            for line in f:
                data.append(json.loads(line))
        f.close()

        for j in data:
            doc = create_doc(j)
            writer.addDocument(doc)

        file_number += 1
    writer.close()
Example #42
def create_index(index) :
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #43
def indexer(documents_file):
    analyzer = StandardAnalyzer()
    # create the index directory in RAM
    directory = RAMDirectory()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)
    # indexing the documents
    doc = Document()
    lines = documents_file.readlines()
    length = len(lines)
    for line_number in range(length):
        # indexing document ID
        if lines[line_number].startswith(".U"):
            doc_id = lines[line_number + 1].strip()
            if doc.getFields().size() > 0:  # flush the previous document, skipping the initial empty one
                writer.addDocument(doc)
            doc = Document()
            doc.add(Field("DocID", doc_id, TextField.TYPE_STORED))
        # indexing document description
        elif lines[line_number].startswith(".W"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
        # indexing document title
        elif lines[line_number].startswith(".T"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
        # indexing document keywords
        elif lines[line_number].startswith(".M"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()

    return directory, analyzer
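
A hedged usage sketch; the corpus path is illustrative and the file is expected to use the .U/.T/.W/.M record markers parsed above.

with open('data/medline_corpus.txt') as documents_file:
    directory, analyzer = indexer(documents_file)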
Example #44
class Indexer():
    """This class provide functions to index article stored in the database."""
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode used when opening the lucene index. Available values are:
                'create', open a new index, overwriting any existing index,
                'append', open an existing index and append to it,
                'create_or_append', behave like 'append' if `index_dir`
                exists, otherwise like 'create'
        date_format : string
            Datetime fields are saved as strings; `date_format` specifies how
            to format a datetime into a string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)

    def index_one(self, article):
        """Create index for one url object in the database.
        """
        try:
            date_published_str = article['date_published'].strftime(
                self.date_format)
        except Exception as e:
            logger.warning('Error when formating date_published %r: %s ',
                           article['canonical_url'], e)
            return
        doc = Document()
        doc.add(StoredField('group_id', article['group_id']))
        doc.add(StoredField('article_id', article['article_id']))
        doc.add(
            StringField('date_published', date_published_str, Field.Store.YES))
        doc.add(
            SortedDocValuesField('date_published',
                                 BytesRef(date_published_str)))
        doc.add(StoredField('date_published', date_published_str))
        doc.add(StringField('domain', article['domain'], Field.Store.YES))
        doc.add(StringField('site_type', article['site_type'],
                            Field.Store.YES))
        doc.add(
            TextField('canonical_url', article['canonical_url'],
                      Field.Store.YES))
        doc.add(TextField('title', article['title'], Field.Store.YES))
        doc.add(TextField('meta', article['meta'], Field.Store.NO))
        doc.add(TextField('content', article['content'], Field.Store.NO))
        doc.add(StoredField('uq_id_str', article['uq_id_str']))
        self.writer.addDocument(doc)

    def close(self):
        """Close the index writer."""
        self.writer.close()
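
A hedged usage sketch for this Indexer; it assumes the JVM has been initialized with lucene.initVM(), and the article dict simply mirrors the keys accessed in index_one() with illustrative values.

from datetime import datetime

indexer = Indexer('/tmp/article_index', mode='create_or_append')
indexer.index_one({
    'group_id': 1,
    'article_id': 42,
    'date_published': datetime(2019, 1, 1, 12, 0, 0),
    'canonical_url': 'http://example.com/story',
    'domain': 'example.com',
    'site_type': 'claim',
    'title': 'An example headline',
    'meta': 'example meta description',
    'content': 'example article body text',
    'uq_id_str': 'example-uq-id',
})
indexer.close()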
Example #45
INDEXDIR = SimpleFSDirectory(Paths.get(indexDir))
indexWriter = IndexWriter(INDEXDIR, config)

for root, dirnames, filenames in os.walk(docDir):
    for filename in filenames:
        print filename
        url = filename.replace("()", "/").replace(".txt", "")
        # print url
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(root, filename)
        path = os.path.abspath(os.path.normpath(path))
        with open(path, 'r') as c:
            contents = unicode(c.read(), 'utf-8')

        doc = Document()
        urlField = Field('url', url, TextField.TYPE_STORED)
        doc.add(urlField)
        nameField = Field('name', filename, TextField.TYPE_STORED)
        doc.add(nameField)
        pathField = Field('path', path, TextField.TYPE_STORED)
        doc.add(pathField)
        contentsField = Field('contents', contents, TextField.TYPE_STORED)
        doc.add(contentsField)

        indexWriter.addDocument(doc)
indexWriter.commit()
indexWriter.close()
end = datetime.now()
print 'Time spent building the index:', (end - start)
Example #46
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
        ID, text, Reddit ID, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()
    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(
                        StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(
                duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
Example #47
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."
    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except:
            #print "Failed to parse doc"
            continue
        doc = Document()
        # print text
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
        if n % 1000 == 0:
            print "Indexed %d files (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #48
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer()

# store the index in memory
directory = RAMDirectory()

# # store the index in File System
# directory = FSDirectory()

config = IndexWriterConfig(analyzer)
iwriter = IndexWriter(directory, config)
doc = Document()
text = "This is the text to be indexed."
doc.add(Field("fieldname", text, TextField.TYPE_STORED))
iwriter.addDocument(doc)
iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)
# parse a simple query that searches for "text"
parser = QueryParser("fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, 1000).scoreDocs

for hit in hits:
    result = isearcher.doc(hit.doc)
    print(result.get("fieldname"))

for txtName in gutenberg_list:
    words = nltk.corpus.gutenberg.words(txtName)
Example #49
class LuceneSearch():
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

    def get_title_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab,
                                               prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):

        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND',
                              '\\AND').replace('OR',
                                               '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR',
                                                          '\\OR').replace(
                                                              'NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self,
                       qs,
                       max_cand,
                       max_full_cand=None,
                       save_cache=False,
                       extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(
                        outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
Example #50
0
class Lucene(object):

    # default fieldnames for id and contents
    FIELDNAME_ID = "id"
    FIELDNAME_CONTENTS = "contents"

    # internal fieldtypes
    # used as Enum, the actual values don't matter
    FIELDTYPE_ID = "id"
    FIELDTYPE_ID_TV = "id_tv"
    FIELDTYPE_TEXT = "text"
    FIELDTYPE_TEXT_TV = "text_tv"
    FIELDTYPE_TEXT_TVP = "text_tvp"
    FIELDTYPE_TEXT_NTV = "text_ntv"
    FIELDTYPE_TEXT_NTVP = "text_ntvp"

    def __init__(self, index_dir, max_shingle_size=None):
        global lucene_vm_init

        if not lucene_vm_init:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            lucene_vm_init = True
        self.dir = SimpleFSDirectory(File(index_dir))
        self.max_shingle_size = max_shingle_size
        self.analyzer = None
        self.reader = None
        self.searcher = None
        self.writer = None
        self.ldf = None

    @staticmethod
    def get_version():
        """Get Lucene version."""
        return Version.LUCENE_48

    @staticmethod
    def preprocess(text):
        """Tokenize and stop the input text."""
        ts = StandardTokenizer(Lucene.get_version(),
                               StringReader(text.lower()))
        ts = StopFilter(Lucene.get_version(), ts,
                        StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        string_builder = StringBuilder()
        ts.reset()
        char_term_attr = ts.addAttribute(CharTermAttribute.class_)
        while ts.incrementToken():
            if string_builder.length() > 0:
                string_builder.append(" ")
            string_builder.append(char_term_attr.toString())
        return string_builder.toString()

    def get_analyzer(self):
        """Get analyzer."""
        if self.analyzer is None:
            std_analyzer = StandardAnalyzer(Lucene.get_version())
            if self.max_shingle_size is None:
                self.analyzer = std_analyzer
            else:
                self.analyzer = ShingleAnalyzerWrapper(std_analyzer,
                                                       self.max_shingle_size)
        return self.analyzer

    def open_reader(self):
        """Open IndexReader."""
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir)

    def get_reader(self):
        return self.reader

    def close_reader(self):
        """Close IndexReader."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        else:
            raise Exception("There is no open IndexReader to close")

    def open_searcher(self):
        """
        Open IndexSearcher. Automatically opens an IndexReader too,
        if it is not already open. There is no close method for the
        searcher.
        """
        if self.searcher is None:
            self.open_reader()
            self.searcher = IndexSearcher(self.reader)

    def get_searcher(self):
        """Returns index searcher (opens it if needed)."""
        self.open_searcher()
        return self.searcher

    def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1):
        """
        Set searcher to use LM similarity.

        :param method: LM similarity ("jm" or "dirichlet")
        :param smoothing_param: smoothing parameter (lambda or mu)
        """
        if method == "jm":
            similarity = LMJelinekMercerSimilarity(smoothing_param)
        elif method == "dirichlet":
            similarity = LMDirichletSimilarity(smoothing_param)
        else:
            raise Exception("Unknown method")

        if self.searcher is None:
            raise Exception("Searcher has not been created")
        self.searcher.setSimilarity(similarity)

    def open_writer(self):
        """Open IndexWriter."""
        if self.writer is None:
            config = IndexWriterConfig(Lucene.get_version(),
                                       self.get_analyzer())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            self.writer = IndexWriter(self.dir, config)
        else:
            raise Exception("IndexWriter is already open")

    def close_writer(self):
        """Close IndexWriter."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        else:
            raise Exception("There is no open IndexWriter to close")

    def add_document(self, contents):
        """
        Adds a Lucene document with the specified contents to the index.
        See LuceneDocument.create_document() for the explanation of contents.
        """
        if self.ldf is None:  # create a single LuceneDocument object that will be reused
            self.ldf = LuceneDocument()
        self.writer.addDocument(self.ldf.create_document(contents))

    def get_lucene_document_id(self, doc_id):
        """Loads a document from a Lucene index based on its id."""
        self.open_searcher()
        query = TermQuery(Term(self.FIELDNAME_ID, doc_id))
        tophit = self.searcher.search(query, 1).scoreDocs
        if len(tophit) == 1:
            return tophit[0].doc
        else:
            return None

    def get_document_id(self, lucene_doc_id):
        """Gets lucene document id and returns the document id."""
        self.open_reader()
        return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID)

    def print_document(self, lucene_doc_id, term_vect=False):
        """Prints document contents."""
        if lucene_doc_id is None:
            print "Document is not found in the index."
        else:
            doc = self.reader.document(lucene_doc_id)
            print "Document ID (field '" + self.FIELDNAME_ID + "'): " + doc.get(
                self.FIELDNAME_ID)

            # first collect (unique) field names
            fields = []
            for f in doc.getFields():
                if f.name() != self.FIELDNAME_ID and f.name() not in fields:
                    fields.append(f.name())

            for fname in fields:
                print fname
                for fv in doc.getValues(
                        fname):  # printing (possibly multiple) field values
                    print "\t" + fv
                # term vector
                if term_vect:
                    print "-----"
                    termfreqs = self.get_doc_termfreqs(lucene_doc_id, fname)
                    for term in termfreqs:
                        print term + " : " + str(termfreqs[term])
                    print "-----"

    def get_lucene_query(self, query, field=FIELDNAME_CONTENTS):
        """Creates Lucene query from keyword query."""
        query = query.replace("(", "").replace(")", "").replace("!", "")
        return QueryParser(Lucene.get_version(), field,
                           self.get_analyzer()).parse(query)

    def analyze_query(self, query, field=FIELDNAME_CONTENTS):
        """
        Analyses the query and returns query terms.

        :param query: query
        :param field: field name
        :return: list of query terms
        """
        qterms = []  # holds a list of analyzed query terms
        ts = self.get_analyzer().tokenStream(field, query)
        term = ts.addAttribute(CharTermAttribute.class_)
        ts.reset()
        while ts.incrementToken():
            qterms.append(term.toString())
        ts.end()
        ts.close()
        return qterms

    def get_id_lookup_query(self, id, field=None):
        """Creates Lucene query for searching by (external) document id."""
        if field is None:
            field = self.FIELDNAME_ID
        return TermQuery(Term(field, id))

    def get_and_query(self, queries):
        """Creates an AND Boolean query from multiple Lucene queries."""
        # empty boolean query with Similarity.coord() disabled
        bq = BooleanQuery(False)
        for q in queries:
            bq.add(q, BooleanClause.Occur.MUST)
        return bq

    def get_or_query(self, queries):
        """Creates an OR Boolean query from multiple Lucene queries."""
        # empty boolean query with Similarity.coord() disabled
        bq = BooleanQuery(False)
        for q in queries:
            bq.add(q, BooleanClause.Occur.SHOULD)
        return bq

    def get_phrase_query(self, query, field):
        """Creates phrase query for searching exact phrase."""
        phq = PhraseQuery()
        for t in query.split():
            phq.add(Term(field, t))
        return phq

    def get_span_query(self, terms, field, slop, ordered=True):
        """
        Creates near span query

        :param terms: list of terms
        :param field: field name
        :param slop: number of terms between the query terms
        :param ordered: If true, ordered search; otherwise unordered search
        :return: lucene span near query
        """
        span_queries = []
        for term in terms:
            span_queries.append(SpanTermQuery(Term(field, term)))
        span_near_query = SpanNearQuery(span_queries, slop, ordered)
        return span_near_query

    def get_doc_phrase_freq(self, phrase, field, slop, ordered):
        """
        Returns per-document frequency of a given phrase in a given field.

        :param phrase: str
        :param field: field name
        :param slop: number of terms in between
        :param ordered: If true, term occurrences should be ordered
        :return: dictionary {doc: freq, ...}
        """
        # creates span near query
        span_near_query = self.get_span_query(phrase.split(" "),
                                              field,
                                              slop=slop,
                                              ordered=ordered)

        # extracts document frequency
        self.open_searcher()
        index_reader_context = self.searcher.getTopReaderContext()
        term_contexts = HashMap()
        terms = TreeSet()
        span_near_query.extractTerms(terms)
        for term in terms:
            term_contexts.put(term,
                              TermContext.build(index_reader_context, term))
        leaves = index_reader_context.leaves()
        doc_phrase_freq = {}
        # iterates over all atomic readers
        for atomic_reader_context in leaves:
            bits = atomic_reader_context.reader().getLiveDocs()
            spans = span_near_query.getSpans(atomic_reader_context, bits,
                                             term_contexts)
            while spans.next():
                lucene_doc_id = spans.doc()
                doc_id = atomic_reader_context.reader().document(
                    lucene_doc_id).get(self.FIELDNAME_ID)
                if doc_id not in doc_phrase_freq:
                    doc_phrase_freq[doc_id] = 1
                else:
                    doc_phrase_freq[doc_id] += 1
        return doc_phrase_freq

    def get_id_filter(self):
        return FieldValueFilter(self.FIELDNAME_ID)

    def __to_retrieval_results(self, scoredocs, field_id=FIELDNAME_ID):
        """Converts Lucene scoreDocs results to RetrievalResults format."""
        rr = RetrievalResults()
        if scoredocs is not None:
            for i in xrange(len(scoredocs)):
                score = scoredocs[i].score
                lucene_doc_id = scoredocs[i].doc  # internal doc_id
                doc_id = self.reader.document(lucene_doc_id).get(field_id)
                rr.append(doc_id, score, lucene_doc_id)
        return rr

    def score_query(self,
                    query,
                    field_content=FIELDNAME_CONTENTS,
                    field_id=FIELDNAME_ID,
                    num_docs=100):
        """Scores a given query and return results as a RetrievalScores object."""
        lucene_query = self.get_lucene_query(query, field_content)
        scoredocs = self.searcher.search(lucene_query, num_docs).scoreDocs
        return self.__to_retrieval_results(scoredocs, field_id)

    def num_docs(self):
        """Returns number of documents in the index."""
        self.open_reader()
        return self.reader.numDocs()

    def num_fields(self):
        """Returns number of fields in the index."""
        self.open_reader()
        atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader)
        return atomic_reader.getFieldInfos().size()

    def get_fields(self):
        """Returns name of fields in the index."""
        fields = []
        self.open_reader()
        atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader)
        for fieldInfo in atomic_reader.getFieldInfos().iterator():
            fields.append(fieldInfo.name)
        return fields

    def get_doc_termvector(self, lucene_doc_id, field):
        """Outputs the document term vector as a generator."""
        terms = self.reader.getTermVector(lucene_doc_id, field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum

    def get_doc_termfreqs(self, lucene_doc_id, field):
        """
        Returns term frequencies for a given document field.

        :param lucene_doc_id: Lucene document ID
        :param field: document field
        :return: dictionary {term: freq, ...}
        """
        termfreqs = {}
        for term, termenum in self.get_doc_termvector(lucene_doc_id, field):
            termfreqs[term] = int(termenum.totalTermFreq())
        return termfreqs

    def get_doc_termfreqs_all_fields(self, lucene_doc_id):
        """
        Returns term frequency for all fields in the given document.

        :param lucene_doc_id: Lucene document ID
        :return: dictionary {field: {term: freq, ...}, ...}
        """
        doc_termfreqs = {}
        vectors = self.reader.getTermVectors(lucene_doc_id)
        if vectors:
            for field in vectors.iterator():
                doc_termfreqs[field] = {}
                terms = vectors.terms(field)
                if terms:
                    termenum = terms.iterator(None)
                    for bytesref in BytesRefIterator.cast_(termenum):
                        doc_termfreqs[field][bytesref.utf8ToString()] = int(
                            termenum.totalTermFreq())
                    print doc_termfreqs[field]
        return doc_termfreqs

    def get_coll_termvector(self, field):
        """ Returns collection term vector for the given field."""
        self.open_reader()
        fields = MultiFields.getFields(self.reader)
        if fields is not None:
            terms = fields.terms(field)
            if terms:
                termenum = terms.iterator(None)
                for bytesref in BytesRefIterator.cast_(termenum):
                    yield bytesref.utf8ToString(), termenum

    def get_coll_termfreq(self, term, field):
        """ 
        Returns collection term frequency for the given term and field.

        :param term: string
        :param field: string, document field
        :return: int
        """
        self.open_reader()
        return self.reader.totalTermFreq(Term(field, term))

    def get_doc_freq(self, term, field):
        """
        Returns document frequency for the given term and field.

        :param term: string, term
        :param field: string, document field
        :return: int
        """
        self.open_reader()
        return self.reader.docFreq(Term(field, term))

    def get_doc_count(self, field):
        """
        Returns number of documents with at least one term for the given field.

        :param field: string, field name
        :return: int
        """
        self.open_reader()
        return self.reader.getDocCount(field)

    def get_coll_length(self, field):
        """ 
        Returns the total length of a field in the collection (sum of term frequencies).

        :param field: string, field name
        :return: int
        """
        self.open_reader()
        return self.reader.getSumTotalTermFreq(field)

    def get_avg_len(self, field):
        """ 
        Returns average length of a field in the collection.

        :param field: string, field name
        """
        self.open_reader()
        n = self.reader.getDocCount(
            field)  # number of documents with at least one term for this field
        len_all = self.reader.getSumTotalTermFreq(field)
        if n == 0:
            return 0
        else:
            return len_all / float(n)
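For reference, a minimal read-only usage sketch of the Lucene wrapper above: the index path and query are hypothetical, the module-level lucene_vm_init flag from the original file is assumed to be defined, and only methods defined in the class are used.

index_dir = "/path/to/existing/index"  # hypothetical; must contain the id/contents fields
lucene_handle = Lucene(index_dir)

print(lucene_handle.num_docs())    # total documents in the index
print(lucene_handle.get_fields())  # field names
print(lucene_handle.get_coll_termfreq("lucene", Lucene.FIELDNAME_CONTENTS))
print(lucene_handle.get_avg_len(Lucene.FIELDNAME_CONTENTS))

lucene_handle.open_searcher()
results = lucene_handle.score_query("open source search", num_docs=10)  # RetrievalResults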
Example #51
0
class LuceneSearch(object):
    def __init__(self, args):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.args = args

        index_folder = os.path.join(DATA_DIR, args.index_folder)
        if not os.path.exists(index_folder):
            self.doc_db = DocDB()
            logger.info(f'Creating index at {index_folder}')
            self.create_index(index_folder)

        fsDir = MMapDirectory(Paths.get(index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(MyTFIDFSimilarity())
        self.analyzer = MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
        self.pool = ThreadPool(processes=args.num_search_workers)

    def add_doc(self, title, text, tokens):

        doc = Document()
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", text, self.t2))
        doc.add(Field("token", tokens, self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 16 GB
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def search_multithread(self, qs, ranker_doc_max, searcher):
        self.ranker_doc_max = ranker_doc_max
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:
                # self.args.ngram == 1
                query = QueryParser('text',
                                    self.analyzer).parse(QueryParser.escape(q))
        except Exception as e:
            logger.warning(colored(f'{e}: {q}, use query dummy.', 'yellow'))
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:
                # self.args.ngram == 1
                query = QueryParser('text', self.analyzer).parse('dummy')

        doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
        hits = self.curr_searcher.search(query, self.ranker_doc_max)

        for i, hit in enumerate(hits.scoreDocs):
            doc = self.curr_searcher.doc(hit.doc)

            doc_score = hit.score
            doc_title = doc['title']
            doc_word = doc['token'].split('<&>')
            doc_text = doc['text']

            doc_scores.append(doc_score)
            doc_titles.append(doc_title)
            doc_words.append(doc_word)
            doc_texts.append(doc_text)

        if len(doc_scores) == 0:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {q}.',
                    'yellow'))

        return doc_scores, doc_titles, doc_texts, doc_words

    def search_singlethread(self, qs, ranker_doc_max, curr_searcher):
        out = []
        for q in qs:
            try:
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:
                    # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse(
                        QueryParser.escape(q))
            except Exception as e:
                logger.warning(
                    colored(f'{e}: {q}, use query dummy.', 'yellow'))
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:
                    # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse('dummy')

            doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
            hits = curr_searcher.search(query, ranker_doc_max)

            for i, hit in enumerate(hits.scoreDocs):
                doc = curr_searcher.doc(hit.doc)

                doc_score = hit.score
                doc_title = doc['title']
                doc_word = doc['token'].split('<&>')
                doc_text = doc['text']

                doc_scores.append(doc_score)
                doc_titles.append(doc_title)
                doc_words.append(doc_word)
                doc_texts.append(doc_text)

            if len(doc_scores) == 0:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {q}.',
                        'yellow'))

            out.append((doc_scores, doc_titles, doc_texts, doc_words))

        return out

    def batch_closest_docs(self, qs, ranker_doc_max):

        if self.args.num_search_workers > 1:
            out = self.search_multithread(qs, ranker_doc_max, self.searcher)
        else:
            out = self.search_singlethread(qs, ranker_doc_max, self.searcher)

        return out

    def _parse_query(self, field_name, query):
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
        ts.close()

        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            pq = builder.build()
            booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        final_query = booleanQuery.build()
        return final_query
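A hedged usage sketch for this LuceneSearch variant follows; the argument names mirror what __init__ and the search methods read (args.index_folder, args.num_search_workers, args.ngram), the paths are placeholders, and module-level helpers such as DATA_DIR, DocDB, MySimpleAnalyzer and MyTFIDFSimilarity are assumed to be importable as in the original file.

from argparse import Namespace

args = Namespace(index_folder='wiki_index',   # resolved relative to DATA_DIR
                 num_search_workers=4,
                 ngram=1)                     # 1 -> plain QueryParser, 2 -> phrase queries
engine = LuceneSearch(args)

queries = ['who wrote the origin of species', 'capital of france']
for doc_scores, doc_titles, doc_texts, doc_words in engine.batch_closest_docs(queries, ranker_doc_max=5):
    print(list(zip(doc_titles[:3], doc_scores[:3])))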
Example #52
0
class Indexer:
    """
        Indexer class containing the methods used to index documents.
    """
    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(CharArraySet(Arrays.asList(
        ["a", "a's", "able", "about", "above", "according", "accordingly", "across", "actually", "after",
         "afterwards", "again", "against", "ain't", "all", "allow", "allows", "almost", "alone", "along", "already",
         "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow",
         "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate",
         "are", "aren't", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away",
         "awfully", "b", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
         "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both",
         "brief", "but", "by", "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes",
         "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently",
         "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't",
         "course", "currently", "d", "definitely", "described", "despite", "did", "didn't", "different", "do",
         "does", "doesn't", "doing", "don't", "done", "down", "downwards", "during", "e", "each", "edu", "eg",
         "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever",
         "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f",
         "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly",
         "forth", "four", "from", "further", "furthermore", "g", "get", "gets", "getting", "given", "gives", "go",
         "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "happens", "hardly", "has",
         "hasn't", "have", "haven't", "having", "he", "he's", "hello", "help", "hence", "her", "here", "here's",
         "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither",
         "hopefully", "how", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored",
         "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar",
         "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "its", "itself", "j", "just",
         "k", "keep", "keeps", "kept", "know", "knows", "known", "l", "last", "lately", "later", "latter",
         "latterly", "least", "less", "lest", "let", "let's", "like", "liked", "likely", "little", "look",
         "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely",
         "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n", "name", "namely", "nd",
         "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine",
         "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere",
         "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only",
         "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
         "over", "overall", "own", "p", "particular", "particularly", "per", "perhaps", "placed", "please", "plus",
         "possible", "presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re",
         "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "s",
         "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed",
         "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven",
         "several", "shall", "she", "should", "shouldn't", "since", "six", "so", "some", "somebody", "somehow",
         "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified",
         "specify", "specifying", "still", "sub", "such", "sup", "sure", "t", "t's", "take", "taken", "tell",
         "tends", "th", "than", "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their", "theirs",
         "them", "themselves", "then", "thence", "there", "there's", "thereafter", "thereby", "therefore",
         "therein", "theres", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "think",
         "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru",
         "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying",
         "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon",
         "us", "use", "used", "useful", "uses", "using", "usually", "uucp", "v", "value", "various", "very", "via",
         "viz", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've",
         "welcome", "well", "went", "were", "weren't", "what", "what's", "whatever", "when", "whence", "whenever",
         "where", "where's", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
         "which", "while", "whither", "who", "who's", "whoever", "whole", "whom", "whose", "why", "will", "willing",
         "wish", "with", "within", "without", "won't", "wonder", "would", "would", "wouldn't", "x", "y", "yes",
         "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"]), False))

    def __init__(self, index_dir):
        """

        :param index_dir: the dir where to store the index.
        """
        self.indexDir = index_dir
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=self.ENGLISH_STOP_WORDS_SET)
        conf = IndexWriterConfig(self.analyzer)
        conf.setUseCompoundFile(False)
        directory = FSDirectory.open(Paths.get(index_dir))
        self.writer = IndexWriter(directory, conf)

    def index_folder(self, folder2index):
        """
        :param folder2index: the folder to be indexed.
        :return:
        """
        # Browse all the files from root and store the paths
        files = glob.glob(folder2index + '**/*.xml', recursive=True)
        num_lines = len(files)
        print('\n==> Start processing....\n')
        # Iterate in the files paths list
        with tqdm(total=num_lines) as pbar:
            for file in files:
                pbar.update(1)
                doc = WikiDocument(file)  # this parse the wikipedia page
                self.index_document(doc)  # this indexes the wikipedia page
        print("\n==> Please wait ...\n")
        self.writer.commit()
        print('A total of ' + str(self.writer.getDocStats().numDocs) +
              ' documents have been indexed.')
        self.close()

    def index_document(self, wiki_doc):
        """
        :param wiki_doc: the document to be indexed.
        :return:
        """
        # Method that indexes documents
        i = 0
        for section in wiki_doc.sections:
            doc = Document()
            doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
            doc.add(TextField("title_article", wiki_doc.title, Field.Store.YES))
            doc.add(StringField("id_section", str(
                wiki_doc.id) + "_" + str(i), Field.Store.YES))
            doc.add(TextField("title_section", section.title, Field.Store.YES))
            doc.add(TextField("content_section", section.text, Field.Store.YES))
            self.writer.addDocument(doc)
            i += 1

    def close(self):
        # close the index
        self.writer.close()
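A possible driver for the Indexer above; the folder names are placeholders, the JVM is assumed to be started once per process, and MyPythonEnglishAnalyzer / WikiDocument come from the original project.

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexer = Indexer('wiki_section_index')
indexer.index_folder('wikipedia_dump/')  # commits, reports the doc count and closes the writer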
Example #53
0
File: idx.py  Project: mkind/crawler
class Indexer(object):
    """
    The index class contains everything that is needed to index files.

    """
    def __init__(self, dest=None):
        """
        Create an Apache Lucene indexer.

        input:
            dest    destination to store index information. If not set, use
                    RAM.

        """
        # where to store information file or ram
        if dest:
            _dir = FSDirectory.open(java.io.File(dest))
        else:
            _dir = RAMDirectory()
        self.directory = _dir

        # analyser
        self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

        # index writer
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
        cfg.setDefaultWriteLockTimeout(6000)
        self.idx_writer = IndexWriter(self.directory, cfg)

    def add_document(self, url, field, text):
        """
        add a new document to index writer

        input:
            url     the url of the target to be indexed
            field   fieldname of the value that will be indexed
            text    text to be indexed

        """
        doc = Document()
        doc.add(Field('url', url, TextField.TYPE_STORED))
        doc.add(Field(field, text, TextField.TYPE_STORED))
        self.idx_writer.addDocument(doc)

    def close_indexer(self):
        self.idx_writer.close()

    def search(self, field, text):
        """
        search text within indexed data

        input:
            field   fieldname of the value that will be indexed
            text    text to search

        output:
            hits    return a list of hits

        """
        results = []
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field,
                                      self.analyser)
        query = parser.parse(text)

        # search
        hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))

        return results
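A small end-to-end sketch for the crawler indexer above; URLs and text are made up, and leaving dest unset keeps the index in a RAMDirectory.

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
idx = Indexer()  # RAM-backed index
idx.add_document('http://example.com/a', 'content', 'lucene indexing in python')
idx.add_document('http://example.com/b', 'content', 'web crawling basics')
idx.close_indexer()
print(idx.search('content', 'lucene'))  # -> [(score, url, title), ...]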
def main():
    """Function to index negative situations and retrive based on input sentence"""

    all_sent_df = pd.read_csv("../data/sentiment_data.csv")
    neg = all_sent_df[all_sent_df["label"] == 1]
    all_neg_phrases = list(neg["phrase"])
    with open("../data/negSituations.txt", "r") as fpointer:
        all_neg_situations = fpointer.readlines()

    all_neg_situations = map(lambda s: s.strip(), all_neg_situations)
    all_neg_phrases = map(lambda s: s.strip(), all_neg_phrases)

    lucene.initVM()
    analyzer = StandardAnalyzer()
    path = Paths.get('negSituationIndex')
    directory = SimpleFSDirectory(path)
    writer_config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writer_config)

    print(writer.numDocs())
    # INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
    for each in all_neg_situations:
        document = Document()
        document.add(Field("negativeSituations", each, TextField.TYPE_STORED))
        writer.addDocument(document)

    print(writer.numDocs())
    writer.close()

    analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)

    # QUERYING FOR A QUESTION
    with open("../data/negative_situation_to_retrieve.txt", "r") as fpointer:
        all_test_sent = fpointer.readlines()
    all_test_sent = map(lambda s: s.strip(), all_test_sent)

    query_parser = QueryParser("negativeSituations", analyzer)

    total_num = 0
    tic = time.time()
    all_ans = []
    for each in all_test_sent:
        total_num = total_num + 1
        if total_num % 1000 == 0:
            print(total_num, time.time() - tic)

        query = query_parser.parse(query_parser.escape(each))
        hits = searcher.search(query, 3)
        docs_scores = [hit.score for hit in hits.scoreDocs]
        current_ans = []
        if docs_scores != []:
            for hit in hits.scoreDocs:
                doc_t = searcher.doc(hit.doc)
                doc_text = doc_t.get("negativeSituations")
                current_ans.append(doc_text)
        else:
            continue

        current_ans = list(set(current_ans))
        all_ans.append(current_ans)

    print(all_ans)
Example #55
0
def main():
    global lucene_vm_init
    if not lucene_vm_init:
       lucene.initVM(vmargs=['-Djava.awt.headless=true'])
       lucene_vm_init = True
    
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path 
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    
    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm,config)
    # read query
    read_query()
    
    # initialize mongodb client
    mongoObj=Mongo_Object('localhost',27017)
      
    # search
    docDup=set()
    finalDup={}
    
    for i in xrange(len(queries)):
        print 'process query %d' %(i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        
        
        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            if d['title'] in docDup:
               finalDup[d['title']]=d
               continue
            docDup.add(d['title'])
            
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            title=d['title']
            if d['title'] in docDup:
               continue
            docDup.add(title)
            
            item=(mongoObj.conn_me).find_one({'title':title})
            if item is None:
               continue
            entitylist=item['entitylist'].split('|')
            for en_title in entitylist:
                if title==en_title:
                   continue
                t=Term('title',en_title)
                q=TermQuery(t)
                docs=searcher2.search(q,2)
                if docs.totalHits<=1:
                   continue
                
                docID2=(docs.scoreDocs)[0].doc
                doc=searcher2.doc(docID2)
                finalDup[doc['title']]=doc

    print 'begin to clean index, there are %d dup records' %(len(finalDup))
    for title in finalDup:
        doc=finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        
        name=doc['name']
        value=doc['value']
        category=doc['category']
        skos_category=doc['skos_category']
        all_text=doc['all_text']
        raw_name=doc['raw_name']
        raw_value=doc['raw_value']
        abstract=doc['abstract']
        
        print 'process '+title
        t=Term('title',title)
        q=TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract)
    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
       #w.addDocuments(batch)
       #cnt_batch=0
       #del batch[:]
    w.close()
Example #56
0
class Indexer(object):
	# Creates the index and adds documents to it.
	# indexDir is the directory where the index is created.
	def __init__(self, indexDir):
		f = Paths.get(indexDir)
		self._dir = SimpleFSDirectory(f)
		analyzer = StandardAnalyzer()
		config = IndexWriterConfig(analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		self._writer = IndexWriter(self._dir, config)
		
	def close(self):
		self._writer.close()

	def getDoc(self, file):
		try:
			f = open(os.getcwd()+FILE_DIR+'/'+file, "r")

			try:
				c = []
				s = BeautifulSoup(f, 'html.parser')
				text = s.findAll(text=True)
				c = filter(tag_vis, text)
				try:
					c = ' '.join(c)
				except Exception as e:
					c = b' '.join(c)
			except Exception as e:
				print(str(e))
				return
			content = TextField("contents", c, Field.Store.YES)
			fileName = str(Paths.get(file)).split('/')[-1]
			fileName = fileName[:fileName.find(".")]
			filename = TextField("filename",
							 fileName,
							 Field.Store.YES)
			path = TextField("filepath",
						 str(os.getcwd()+FILE_DIR+'/'+file),
						 Field.Store.NO)
			doc = Document()
			doc.add(content)
			doc.add(filename)
			doc.add(path)
			return doc
		except Exception as e:
			print(type(Exception).__name__)
			print(str(e))
			return

	def indexFile(self, file):
		if ( self.getDoc(file) is not None ):
			self._writer.addDocument(self.getDoc(file))
	#pass in absolute path when calling this function
	def createIndex(self, path):
		for file in os.listdir(path):
			print(file)
			if os.path.isfile(path+"/"+file):
				self.indexFile(file)
		return self._writer.numDocs()
	def closeWriter(self):
		self._writer.close()
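A hypothetical driver for the HTML indexer above; FILE_DIR is assumed to be the module-level constant the class reads, and createIndex() expects the same absolute path that getDoc() reconstructs per file.

import os
import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexer = Indexer('html_index')
print(indexer.createIndex(os.getcwd() + FILE_DIR))  # returns the number of indexed docs
indexer.closeWriter()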
Example #57
0
def indexDocs(storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = SimpleFSDirectory(Paths.get(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    root = "wiki-pages-text/"
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for root, dirnames, filenames in os.walk(top=root):
        print(root, dirnames, filenames)
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print("adding " + filename)
            try:
                path = os.path.join(root, filename)
                file = open(path, encoding="utf8")
                i = 0
                #contents = file.read()
                while True:
                    i += 1
                    line = file.readline()
                    doc = Document()
                    if not line:
                        break
                    termName = line.split()[0] + ' ' + line.split()[1]
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("line", i, t1))
                    doc.add(Field("termName", termName, t2))
                    doc.add(Field("content", line.replace(termName, ''), t2))

                    writer.addDocument(doc)
                file.close()
                """
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print ("warning: no content in " + filename)
                writer.addDocument(doc)
                """
            except Exception as e:
                print("Failed in indexDocs:" + str(e))
    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
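A hypothetical invocation of indexDocs above; the store directory is a placeholder, and the Ticker helper used at the end of the function is assumed to be defined in the same module.

import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
indexDocs('fever_index', StandardAnalyzer())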
Example #58
0
def lucene_indexing():
    lucene.initVM()
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    indexDir = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    analyzer = PorterStemmerAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    lprint("Building lucene index ...")
    with SqliteDict(str(config.WHOLE_WIKI_DB),
                    flag='r',
                    encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor,
                               total=config.TOTAL_ARTICLE_NUMBER_WHOLE):

            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            # TODO: change it to extract abstract wiki?
            # get the first paragraph whose length is >= 50? seems weird.
            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:  # document too short
                valid_page = False

            # only title
            title_term_list = []
            title_poss_list = []

            # only abstract content
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # If the terms are in title, we don't include those terms in abstract and article term.
                    else:
                        if p_i == abs_index:  # If the terms are in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

            added_title = article_title
            added_text = " ".join(title_term_list + abstract_term_list)

            doc = Document()
            doc.add(Field("title", added_title, StoredField.TYPE))
            doc.add(Field("text", added_text, TextField.TYPE_STORED))
            writer.addDocument(doc)
    writer.close()
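A hedged retrieval sketch against the index built above; it reuses the same custom PorterStemmerAnalyzer and the config.LUCENE_INDEXED path from that module, and the query string is arbitrary.

from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED))))
searcher = IndexSearcher(reader)
query = QueryParser("text", PorterStemmerAnalyzer()).parse("barack obama")
for hit in searcher.search(query, 5).scoreDocs:
    print(searcher.doc(hit.doc)['title'])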
Example #59
0
class LuceneSearch():
    """Index and search docs.

    Parameters
    ----------
    index_dir : str
        Index of the documents produced by Lucene
    db_path: str
        File path of the SQLite database containing articles of the Wikipedia dump (from DrQA).
    num_search_workers: int (optional), default=8
        Workers to use to accelerate searching.
    """
    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:

        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])

        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)

    def _create_index(self, index_dir: str) -> None:
        """Index documents

        Parameters
        ----------
        index_dir : str
            The dir to store index
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 16 GB
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)

            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))

            self.writer.addDocument(doc)

        logger.info("Indexed %d docs.", self.writer.numDocs())
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def _search_multithread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        args = [(query, doc_max) for query in queries]
        queries_results = self.pool.starmap(self._search_multithread_part,
                                            args)
        return queries_results

    def _search_multithread_part(
            self, query: str,
            doc_max: int) -> List[Dict[str, Union[float, str]]]:
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            query = QueryParser('text',
                                self.analyzer).parse(QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')

        query_results = []
        hits = self.searcher.search(query, doc_max)

        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)

            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })

        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))

        return query_results

    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)

            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)

                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })

            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))

            queries_result.append(query_results)

        return queries_result

    def search(self,
               query: str,
               doc_max: int = 20) -> List[Dict[str, Union[float, str]]]:
        """Search a given query.

        Parameters
        ----------
        query : str
            Anything you want to search
        doc_max : int
            Maximum number of result to return

        Returns
        -------
        Tuple[Any]
            Search results.
        """
        return self.batch_search([query], doc_max=doc_max)[0]

    def batch_search(
            self,
            queries: List[str],
            doc_max: int = 20) -> List[List[Dict[str, Union[float, str]]]]:
        """
        Search a list of queries.

        Parameters
        ----------
        queries : List[str]
            queries list
        doc_max : int, optional, default=20
            maximum number of docs returned by the search engine.

        Returns
        -------
        List[Tuple[Any]]
            Result returned by the search engine.
        """
        if self.num_search_workers > 1:
            result = self._search_multithread(queries, doc_max)
        else:
            result = self._search_singlethread(queries, doc_max)

        return result

    @staticmethod
    def pprint(search_result: List[Dict[str, Union[float, str]]]) -> None:
        """Print the results returned by the doc searcher.

        Parameters
        ----------
        search_result : List[Dict[str, Union[float, str]]]
            Results returned from ranker
        """

        headers = ['Rank', 'Title', 'Text', 'Score']
        table = prettytable.PrettyTable(headers)
        for i, result in enumerate(search_result):
            text, title = result['text'], result['title']
            text = text[:100] + ' ...' if len(text) > 100 else text
            title = title[:30] + ' ...' if len(title) > 30 else title
            table.add_row([i, title, text, '%.5g' % result['score']])
        print('Top Results:')
        print(table)
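A usage sketch for the typed LuceneSearch above; the index and database paths are placeholders (the index is built on first use when it does not exist yet).

engine = LuceneSearch(index_dir='data/lucene_index',
                      db_path='data/wikipedia/docs.db',
                      num_search_workers=4)
results = engine.search('theory of relativity', doc_max=5)
LuceneSearch.pprint(results)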
Example #60
0
    def buildIndex(self, inputFile):
        analyzer = self.getAnalyzer()
        iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

        iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(SimpleFSDirectory(File(self.luceneDir)), iwconf)

        # read through input file and write out to lucene
        counter = 0
        linesReadCounter = 0

        with open(inputFile, 'r') as lines:
            linesRead = 0

            for line in lines:
                try:
                    linesRead += 1

                    if linesRead % 1000 == 0:
                        print "%d lines read" % linesRead

                    cui, concept = line.replace("\",\"",
                                                "\t").replace("\"",
                                                              "").split("\t")
                    concept = concept.strip()
                    cui = cui.strip()

                    strNorm = self.normalizeCasePunct(concept)
                    strSorted = self.sortWords(strNorm)
                    strStemmed = self.stemWords(strNorm)
                    strStemmedSorted = self.stemWords(strSorted)

                    fdoc = Document()

                    counter += 1
                    fid = counter

                    fdoc.add(
                        Field("id", unicode(fid), Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                    fdoc.add(
                        Field("cui", cui, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                    fdoc.add(
                        Field("str", concept, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                    fdoc.add(
                        Field("str_norm", strNorm, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                    fdoc.add(
                        Field("str_sorted", strSorted, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                    fdoc.add(
                        Field("str_stemmed", strStemmed, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                    fdoc.add(
                        Field("str_stemmedSorted", strStemmedSorted,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                    writer.addDocument(fdoc)
                    if fid % 1000 == 0:
                        writer.commit()
                except:
                    "Skipping line: %s" % line

        writer.commit()
        writer.close()