def open_reader(self):
     """Open an IndexReader lazily, caching it on self.reader.

     Reads from the in-memory directory when self.use_ram is set,
     otherwise from the on-disk directory self.dir.  A no-op once the
     reader already exists.
     """
     if self.reader is None:
         if self.use_ram:
             # Python 2 print statement; the index was loaded into RAM earlier.
             print "reading from ram directory ..."
             self.reader = DirectoryReader.open(self.ram_dir)
         else:
             self.reader = DirectoryReader.open(self.dir)
Пример #2
0
	def __init__ (self):
		"""Set up index paths, open an IndexReader if an index already
		exists on disk, and configure indexing and (optional) logging."""
		self.mDocumentDirectory = "/home/hnguyen/Projects/CLIFinder/operations.sub"
		self.mIndexDirectory = "/home/hnguyen/Projects/CLIFinder/cli.index"

		self.mIndexReader = None
		# 'is None' is the idiomatic identity test; '== None' invokes __eq__
		# and can misbehave with operands that overload equality.
		if os.path.isdir(self.mIndexDirectory) and self.mIndexReader is None:
			directory = SimpleFSDirectory(File(self.mIndexDirectory))
			self.mIndexReader = DirectoryReader.open(directory)

		############################### IndexingEngine Settings ######################################
		self.mSimilarity = DecreaseLengthNormSimilarity()
		self.mOpenMode = IndexWriterConfig.OpenMode.CREATE
		##############################################################################################

		self.mIsDebug = False

		if self.mIsDebug:
			############################### Setting up loggers ###########################################
			self.mIndexingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/indexing.log"
			self.mSearchingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/searching.log"

			self.mIndexingLogger = LoggingEngine(self.mIndexingLogPath, "IndexingLogger", Queue.Queue())
			self.mSearchingLogger = LoggingEngine(self.mSearchingLogPath, "SearchingLogger", Queue.Queue())

			self.mIndexingLogger.start()
			self.mSearchingLogger.start()
			# Flush/close the logger threads on interpreter exit.
			atexit.register(self.clear)
def search():
	"""Flask view: build Lucene clauses from the POSTed form and render results.

	Collects '+field:value' clauses from the form, delegates retrieval to
	mansearch.buscar, then re-fetches each hit's stored fields for display.
	"""
	lucene.initVM(vmargs=['-Djava.awt.headless=true'])

	args = []
	if request.method == 'POST':
		if request.form['ies']:
			args.append('+ies:'+request.form['ies'])
		if request.form['area']:
			args.append('+area:'+request.form['area'])
		if request.form['professor']:
			args.append('+professor:'+request.form['professor'])
		if request.form['conceito']:
			# One clause per grade field (m/d/f), all sharing the same value.
			args.append('m:'+request.form['conceito'])
			args.append('d:'+request.form['conceito'])
			args.append('f:'+request.form['conceito'])

	table = []
	if args:  # idiomatic truthiness test instead of len(args) > 0
		scoreDocs = mansearch.buscar('indexer/', args)
		fsDir = SimpleFSDirectory(File(indexDir))
		searcher = IndexSearcher(DirectoryReader.open(fsDir))
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
	return render_template('busca.html', table=table)
def buscar(indexDir, args,options = None):
    """Search the Lucene index at `indexDir` with the given clause list.

    `args` is a list of 'field:value' clauses; they are joined with ' +'
    and parsed (AND semantics) against the default field "keywords".
    Returns the raw ScoreDoc array of the top 200 hits.
    """
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    fsDir = SimpleFSDirectory(File(indexDir))

    # Build a searcher over the index directory supplied by the caller.
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to tokenize and filter query terms.
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # QueryParser with "keywords" as the default search field.
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)

    # Every clause must match (AND is the default operator).
    parser.setDefaultOperator(QueryParser.Operator.AND)

    # Join the caller's clauses into one '+'-separated query string.
    command = ' +'.join(args)

    query = parser.parse(command)
    print query

    # Return the ScoreDoc array (a JArray) of the top 200 hits.
    return searcher.search(query, 200).scoreDocs
	def search(self, input_query=None, max_answers=10):
		"""Search `input_query` across the post and answer fields of the index.

		Returns a list of dicts mapping stored-field name -> value, one per
		hit (at most `max_answers`), or None when no query is given.
		"""
		if input_query is None:
			return None

		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

		# Parse the query against every post/answer field at once.
		parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer)
		query = MultiFieldQueryParser.parse(parser, input_query)

		scoreDocs = searcher.search(query, max_answers).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		docs = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
			docs.append(doc_dict)
		return docs
Пример #6
0
    def search(self, field, text):
        """Search the index for `text` within `field`.

        Parameters:
            field -- name of the indexed field to query
            text  -- query text, analyzed with self.analyser

        Returns a list of (score, url, title) tuples, where `title` is
        the stored value of `field` and `url` the stored "url" field.
        """
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # Analyze query terms the same way the documents were indexed.
        query = AnalyzingQueryParser(Version.LUCENE_CURRENT, field,
                                     self.analyser).parse(text)

        matches = idx_searcher.search(query, 1000).scoreDocs.tolist()
        results = []
        for match in matches:
            document = idx_searcher.doc(match.doc)
            results.append((match.score, document.get("url"), document.get(field)))

        return results
	def search(self):
		"""Interactive search loop: prompt for queries and print matches.

		Initialises the JVM, opens the index under self.index_dir, then
		repeatedly parses typed queries against the "title" field until
		the user submits an empty line.
		"""
		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

		while True:
			print
			print "Hit enter with no input to quit."
			command = raw_input("Query:")
			if command == '':
				return

			print
			print "Searching for:", command

			# Parse against the "title" field; show up to 50 hits.
			query = QueryParser(Version.LUCENE_CURRENT, "title",
								analyzer).parse(command)
			scoreDocs = searcher.search(query, 50).scoreDocs
			print "%s total matching documents." % len(scoreDocs)

			for scoreDoc in scoreDocs:
				doc = searcher.doc(scoreDoc.doc)
				print doc
Пример #8
0
 def __init__(self, db_path):
     """Open the Lucene index at `db_path` and prepare searcher + analyzer."""
     store = SimpleFSDirectory(File(db_path))
     index_reader = DirectoryReader.open(store)
     self.searcher = IndexSearcher(index_reader)
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     logger.info("Loaded DB from %s with %d documents: ",
                 db_path, index_reader.numDocs())
Пример #9
0
    def query(self, data):
        """Run data['query'] against the "id" field and collect all hits.

        Returns {'totalHits': n, 'hits': {doc_id: {field: value, ...,
        'score': s}}}; implicitly returns None when the index file does
        not exist yet.
        """
        if not self.fil.exists():
            return None

        searcher = IndexSearcher(DirectoryReader.open(self.d))
        parsed = QueryParser(
            Version.LUCENE_30,
            "id",
            self.analyzer).parse(
            data['query'])
        top = searcher.search(parsed, 100000)

        results = {'totalHits': top.totalHits, 'hits': {}}
        for hit in top.scoreDocs:
            stored = searcher.doc(hit.doc)
            record = {'score': hit.score}
            # Copy every stored field except the id itself.
            for field in stored.getFields():
                name = field.name()
                if name != "id":
                    record[name] = field.stringValue()
            results['hits'][stored.get('id')] = record

        searcher.getIndexReader().close()
        return results
Пример #10
0
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        """Open a searcher over `index_path`, picking parser/similarity flavour.

        With use_default_similarity the stock QueryParser/DefaultSimilarity
        pair is used; otherwise the field-agnostic variants, which require
        explain-based scoring (useExplainQuery=True).
        """
        self.index_path = index_path
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader = DirectoryReader.open(SimpleFSDirectory(File(self.index_path)))
        self.searcher = IndexSearcher(self.reader)

        if use_default_similarity:
            self.query_parser = QueryParser
            similarity = DefaultSimilarity()
            self.useExplainQuery = False
        else:
            self.query_parser = FieldAgnosticQueryParser
            similarity = FieldAgnosticSimilarity()
            self.useExplainQuery = True
        # FieldAgnosticSimilarity uses the coord factor by default; disable
        # via `similarity.useCoord = False` if ever needed.

        self.searcher.setSimilarity(similarity)
        self.method = method  # kept for parity with callers; not used here
        self.logger = logger
Пример #11
0
def build_corpus(n=0):
    """Build the test corpus for subcorpus number `n`.

    Each domain uses its own index (a hack to parallelize queries), so the
    index directory name is derived from the subcorpus name.
    """
    subcorpus = texeval_corpus.test_subcorpora[n]
    store = FSDirectory.open(File(wiki_index + '-' + subcorpus))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(subcorpus, searcher, analyzer)
Пример #12
0
 def __init__(self, base_dir, index_dir, index_file, queryDict):
     """Open the index at index_dir/index_file and run BM25 retrieval for queryDict."""
     self.baseDir = base_dir
     self.indexFile = os.path.join(index_dir, index_file)
     # Needed when Retrieve runs on its own (one JVM per process).
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File(self.indexFile))))
     self.BM25(searcher, queryDict)
     del searcher
 def search_docs(self, value, field="general_info"):
     """Return the stored Documents matching `value` in `field` (up to 1000)."""
     max_results = 1000
     searcher = IndexSearcher(DirectoryReader.open(self.store))
     parsed = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(value)
     top_docs = searcher.search(parsed, max_results)
     return [searcher.doc(h.doc) for h in top_docs.scoreDocs]
def config():
    """Build a (searcher, analyzer) pair for the index next to this script.

    The searcher is switched to BM25 scoring with k1=2.0, b=0.75 (Lucene's
    own defaults are k1=1.2, b=0.75).
    """
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    index = SimpleFSDirectory(File(os.path.join(script_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(index))
    searcher.setSimilarity(BM25Similarity(2.0, 0.75))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer
Пример #15
0
    def perform_search(self, searchterm, results_per_page, page):
        """Search `searchterm` in the title, description and content fields.

        Fetches results_per_page * (page + 1) hits, skips the first
        results_per_page * page of them (pagination offset) and returns
        (results, duration, total_hit_count), where `results` is a list of
        stored-field dicts for the requested page.
        """
        # One QueryParser per field, OR-ed (SHOULD) together so a match in
        # any field qualifies the document.
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip the offset: the first `page` pages of hits
            if count < results_per_page * page:
                count += 1
                continue
            count += 1

            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
Пример #16
0
 def __init__(self, store_dir):
     """Open the index at `store_dir` and prepare several analyzers.

     self.analyzer maps analyzer class name -> instance so callers can
     choose tokenization (Standard/Simple/Chinese) per query.
     """
     initVM()
     directory = SimpleFSDirectory(File(store_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     print 'loaded index: %s' % store_dir
     self.analyzer = {}
     self.analyzer['StandardAnalyzer'] = StandardAnalyzer(Version.LUCENE_CURRENT)
     self.analyzer['SimpleAnalyzer'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
     self.analyzer['ChineseAnalyzer'] = ChineseAnalyzer(Version.LUCENE_CURRENT)
Пример #17
0
 def deleteRec(self, pid):
     """Delete the document whose 'uid' equals `pid`, then refresh the searcher."""
     cfg = IndexWriterConfig(self.analyzer)
     cfg.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, cfg)
     writer.deleteDocuments(Term('uid', pid))
     writer.commit()
     writer.close()
     # Reopen so subsequent searches no longer see the deleted document.
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Пример #18
0
    def __init__(self):
        """Attach to the running JVM and open a searcher over the app's index."""
        lucene.getVMEnv().attachCurrentThread()

        store = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(store))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # Default query field is "contents".
        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
Пример #19
0
    def __init__(self, index_dir):
        """Prepare analyzer, a CREATE-mode writer config, and a searcher
        for the index under `index_dir`."""
        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.config = cfg
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
Пример #20
0
def init_lucene_search():
    """Start the JVM and return (searcher, analyzer) over INDEX_DIR.

    Uses SmartChineseAnalyzer with its default stop-word set.
    """
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    print 'Index ', INDEX_DIR
    # NOTE(review): base_dir is computed but never used below.
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))  # current dir
    directory = SimpleFSDirectory(File(INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet())

    return searcher, analyzer
Пример #21
0
def is_article_indexed(art_id, index='index'):
    """Return True when an article with id `art_id` already exists in `index`."""
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer).parse(str(art_id))

    # A single hit is enough to decide existence.
    hits = searcher.search(parsed, 1).scoreDocs
    return len(hits) > 0
Пример #22
0
  def __init__(self, indexPath):
    """Start the JVM and open a BM25-scored searcher over `indexPath`."""
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION

    #initialize the index
    self.INDEX_DIR = indexPath  #"Clue_Index"
    self.results = None
    self.searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory(File(self.INDEX_DIR))))

    # Rank results with BM25 rather than the default similarity.
    self.searcher.setSimilarity(BM25Similarity())
Пример #23
0
 def query(self, txt, ant=10):
     """Searches for a person or family by id, name, place, or date.

     Returns up to `ant` hits as [uid, score] pairs.
     """
     # Escape slashes (query-parser syntax) and lowercase the text.
     parsed = QueryParser("text", self.analyzer).parse(txt.replace('/', '\/').lower())
     if not self.searcher:
         self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     hits = []
     for scoreDoc in self.searcher.search(parsed, ant).scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         hits.append([doc.get("uid"), scoreDoc.score])
     return hits
Пример #24
0
    def __init__(self, **kwargs):
        """Initialize a new Searcher instance.

        :param count: number of results to return from a query (default 100)
        :param root: directory of the underlying index (default "index")
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        index_store = SimpleFSDirectory(File(self.output))
        self.store = index_store
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(index_store))
Пример #25
0
def search(term, n_docs=10, index='index'):
    """Search `term` in the 'art_body' field; return (score, body) pairs."""
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer).parse(term)

    results = []
    for hit in searcher.search(parsed, n_docs).scoreDocs:
        body = unicode(searcher.doc(hit.doc).get('art_body'))
        results.append((hit.score, body))
    return results
Пример #26
0
 def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
     """Open the index and taxonomy readers and record searcher settings."""
     self._settings = settings
     self._similarity = settings.similarity
     self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
     self._reader = DirectoryReader.open(indexDirectory)
     self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
     wrapper = ReaderSettingsWrapper()
     # get() reports the live similarity + task count; set() reconfigures.
     wrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
     wrapper.set = self._setReadSettings
     self._readerSettingsWrapper = wrapper
     self._searcher = None
     self._executor = None
     self._reopenSearcher = True
Пример #27
0
def setupDir(dbName):
    """Point module-level `indexDir`/`searcher` at dbName's Lucene index.

    dbName has the form "<user>_<db>"; the index lives under
    ./files/<user>/<db>/LuceneIndex and the directory is created when absent.
    """
    global indexDir, searcher
    (user, db) = dbName.split('_', 1)
    directory = "./files/" + user + '/' + db + '/LuceneIndex'
    if not os.path.exists(directory):
        os.mkdir(directory)
    indexDir = SimpleFSDirectory(Paths.get(directory))
    try:
        searcher = IndexSearcher(DirectoryReader.open(indexDir))
    except Exception:
        # Best effort: a brand-new/empty directory holds no index yet,
        # in which case `searcher` simply stays unset.
        pass
Пример #28
0
def retrieve_wiki(text_query, index_directory_name):
    """Yield the 'contents' field of up to 1000 documents matching `text_query`."""
    lucene.initVM()
    store = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    parsed = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(text_query)
    for scoreDoc in searcher.search(parsed, 1000).scoreDocs:
        yield searcher.doc(scoreDoc.doc).get('contents')
Пример #29
0
 def search(self, q, sex, ant=5, config=None):
     """Searches for a match; only hits whose stored "sex" equals `sex` are kept.

     Returns up to `ant` candidate [uid, score] pairs.
     """
     # Escape '/' for the query parser. TODO: fold `sex` into the query itself.
     parsed = QueryParser("match", self.analyzer).parse(q.replace('/', '\/'))
     if not self.searcher:
         self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     matches = []
     for scoreDoc in self.searcher.search(parsed, ant).scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         if doc.get("sex") == sex:
             matches.append([doc.get("uid"), scoreDoc.score])
     return matches
Пример #30
0
def text_search(command, cpage, meth):
    """Run a whitespace-analyzed text search for `command`.

    Rebinds the module-level searcher/analyzer, delegates the actual
    search to runstext() and returns its (text, maxnum) pair.
    """
    global vm_env, searcher, analyzer

    # Attach this thread to the already-created JVM before any Lucene call.
    vm_env.attachCurrentThread()
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(File(STORE_TEXT_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    text, maxnum = runstext(command, cpage, meth)

    # Drop the reference to the searcher before returning.
    del searcher

    return text, maxnum
Пример #31
0
 def __init__(self):
     """Open the './Myindex' index and prepare searcher, analyzer and reader."""
     INDEXDIR = './Myindex'
     # One JVM per process; headless so no display is required.
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     self.directory = SimpleFSDirectory(File(INDEXDIR))
     self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
     self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
     self.reader = IndexReader.open(self.directory)
Пример #32
0
	def __init__(self):
		"""Record Lucene availability and open a searcher over texts/index."""
		self.lucene = bool(luceneImport)

		# Lucene connection: memory-mapped index directory -> reader -> searcher.
		lucene.initVM()
		index_path = "texts/index"
		reader = DirectoryReader.open(MMapDirectory(File(index_path)))
		self.analyzer = StandardAnalyzer(Version.LUCENE_30)
		self.searcher = IndexSearcher(reader)
Пример #33
0
    def __init__(self, store_dir, analyzer, preprocess=lambda x: x):
        """Build an IndexSearcher over `store_dir` plus a QueryParser.

        Input: `store_dir`: directory storing the Lucene index
               `analyzer`: analyzer used to split queries
               `preprocess`: user hook applied to queries (identity by default)
        """
        self.dir = SimpleFSDirectory(File(store_dir).toPath())
        reader = DirectoryReader.open(self.dir)
        self.searcher = IndexSearcher(reader)
        self.preprocess = preprocess

        # Queries default to the "description" field.
        self.parser = QueryParser("description", analyzer)
Пример #34
0
    def getHitCount(self, fieldName, searchString):
        """Count documents whose `fieldName` contains the exact term `searchString`.

        Opens a fresh reader over self.dir, runs a TermQuery capped at 50
        hits, closes the reader and returns the hit count.
        """
        reader = DirectoryReader.open(self.dir) #readOnly = True
        print '%s total docs in index' % reader.numDocs()

        searcher = IndexSearcher(reader) #readOnly = True
        t = Term(fieldName, searchString)
        query = TermQuery(t)
        # NOTE(review): capped at 50, so counts above 50 are truncated.
        hitCount = len(searcher.search(query, 50).scoreDocs)

        reader.close()
        print "%s total matching documents for %s\n---------------" \
              % (hitCount, searchString)
        return hitCount
Пример #35
0
def retrieveDocs(q):
    STORE_DIR = "IndexFiles.index"
    lucene.initVM()
    analyzer = StandardAnalyzer()
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))

    dir = SimpleFSDirectory(File(STORE_DIR).toPath())
    #directory = FSDirectory.getDirectory(File(STORE_DIR))
    reader = DirectoryReader.open(dir)
    #reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(DirectoryReader.open(dir))

    query = QueryParser("contents", analyzer).parse(q)
    #query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(q)
    #query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        #print doc.get("contents").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
Пример #36
0
def search(music_tags, dir_path):
    """Search the index at `dir_path` for documents matching the music tags.

    Builds a single 'content:' query from the tags and prints each of
    the top-50 matching documents.
    """
    lucene.initVM()

    query_str = "content:" + " ".join(music_tags)
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    lucene_analyzer = StandardAnalyzer()
    lucene_searcher = IndexSearcher(DirectoryReader.open(index_dir))

    my_query = QueryParser("content", lucene_analyzer).parse(query_str)
    total_hits = lucene_searcher.search(my_query, 50)

    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        print doc
Пример #37
0
def init_search(search_content, vm_env):
    """Attach to the JVM, search `search_content` and return run()'s result.

    NOTE(review): the `vm_env` parameter is immediately overwritten via
    lucene.getVMEnv(), and lucene.initVM() is called after the thread is
    already attached — both look redundant; confirm before relying on them.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    lucene.initVM()
    STORE_DIR = "index"
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result_s = run(searcher, analyzer, search_content)
    del searcher
    print(result_s)
    return result_s
Пример #38
0
    def runDrillDown(self):
        """Run a drill-down facet search for every entry in drilldownCategories.

        Returns the facet result of the LAST drill-down only.
        NOTE(review): if drilldownCategories is empty, `facetRes` is never
        bound and the final return raises NameError — confirm callers
        always supply a non-empty list.
        """
        # open readers
        taxo = DirectoryTaxonomyReader(self.taxoDir)
        indexReader = DirectoryReader.open(self.indexDir)

        for drilldown in drilldownCategories:
            print "search with drilldown: %s" %  '/'.join(drilldown)
            facetRes = SimpleSearcher.searchWithDrillDown(drilldown, indexReader,
                                                          taxo, self.facets_config)
        # close readers
        taxo.close()
        indexReader.close()
        # return result (from the last iteration)
        return facetRes
Пример #39
0
def func_nr(command):
    """Search the 'name' field for `command`, ranked by descending 'rate'."""
    lucene.getVMEnv().attachCurrentThread()
    STORE_DIR = "index_tb_new"
    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, "name",
                         analyzer).parse(command)
    # Top 50 hits ordered by the numeric 'rate' field, highest first.
    ordering = Sort([SortField("rate", SortField.Type.DOUBLE, True)])
    scoreDocs = searcher.search(parsed, 50, ordering).scoreDocs
    return process(scoreDocs, searcher)
    def similarityOfSynopsis(self):
        """Score every movie synopsis against all later movies' synopses.

        Walks the synopsis files (named by movie pk), queries the synopsis
        index with each file's cleaned text, and writes each pairwise score
        into the matching models.Similarities row (synopsis column).
        """
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS,
                                  analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            # Process files in numeric (pk) order.
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    # Flatten newlines and strip non-alphanumerics.
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    # Long synopses can exceed the BooleanQuery clause limit;
                    # keep doubling the limit until parsing succeeds.
                    while True:
                        try:
                            query = queryParser.parse(
                                QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(
                                self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        # Only score each unordered pair once (minor > major).
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            # The pair may be stored in either order.
                            similarity = models.Similarities.objects.filter(
                                first_movie=major_movie,
                                second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(
                                    first_movie=minor_movie,
                                    second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)
Пример #41
0
def define_search_params(STORE_DIR, FIELD_CONTENTS, TERM):
    """Open the index at STORE_DIR and parse TERM against FIELD_CONTENTS.

    Returns (searcher, reader, query); the caller owns the reader and
    should close it when finished.
    """
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(STORE_DIR)))
    searcher = IndexSearcher(reader)

    # Whitespace-only tokenization; the searched field is configurable.
    queryParser = QueryParser(FIELD_CONTENTS, WhitespaceAnalyzer())
    query = queryParser.parse(TERM)
    return searcher, reader, query
def search(command):
    """Run `command` through the Chinese analyzer against the local index."""
    STORE_DIR = "index"
    store = MMapDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    results = run(searcher, SmartChineseAnalyzer(), command)
    del searcher
    return results


# vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# for y in search('二三四五'):
#     print(y)
Пример #43
0
def main():
    """Annotate all questions against the Lucene index in ./lucene_index."""
    index_path = "lucene_index"
    if not os.path.isdir(index_path):
        raise RuntimeError("Cannot find Lucene index at: {}".format(index_path))
    store = SimpleFSDirectory(Paths.get(index_path))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = EnglishAnalyzer()

    # Other entry points exist in this module (single query, random
    # question); here every question is annotated.
    annotate_all_questions(analyzer, searcher)
    del searcher
Пример #44
0
def func_perfumer(command):
    """Search the 'perfumer' field for `command`; return processed top-233 hits."""
    lucene.getVMEnv().attachCurrentThread()

    STORE_DIR = "index_tb_new"

    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, "perfumer",
                         analyzer).parse(command)
    hits = searcher.search(parsed, 233).scoreDocs
    return process(hits, searcher)
def populate_data(path, args):
    """Extract sense lemmas and translation mappings from a Lucene index.

    Reads every document in the index at `path`, accumulates the
    sense_id -> lemma tables per language of interest (and the optional
    pivot language), builds translation mappings via
    `create_translation_mapping`, and pickles the results under
    args.internal_data_path.
    """
    name = path.split('/')[-1]
    print(f"Processing {name}")

    # Sense tables, keyed by language code -> {sense_id: lemma}.
    all_senses = {args.lang1: {}, args.lang2: {}}
    if args.pivot_lang is not None:
        all_senses[args.pivot_lang] = {}

    all_translation_mappings = []
    if args.pivot_lang is not None:
        all_translation_pivot1_mappings = []
        all_translation_pivot2_mappings = []

    store = SimpleFSDirectory(Paths.get(path))
    dr = DirectoryReader.open(store)
    searcher = IndexSearcher(dr)
    analyzer = StandardAnalyzer()
    query = QueryParser("title", analyzer).parse("*:*")
    topDocs = searcher.search(query, 1000000000)
    for scoreDoc in topDocs.scoreDocs:
        doc = scoreDoc.doc
        language_lemmas = searcher.doc(doc).getValues("LANGUAGE_LEMMA")
        sense_ids = searcher.doc(doc).getValues("ID_SENSE")
        for language_lemma, sense_id in zip(language_lemmas, sense_ids):
            # LANGUAGE_LEMMA looks like "<2-letter lang>:<lemma>" — the slice
            # below assumes a single separator char at position 2.
            lang = language_lemma[:2]
            lemma = language_lemma[3:]
            # BUG FIX: accumulate senses per language instead of replacing
            # the whole per-language dict with a one-entry dict each time,
            # which discarded all previously collected senses.
            if lang in LANGUAGES_OF_INTEREST:
                all_senses.setdefault(lang, {})[sense_id] = lemma
            if args.pivot_lang is not None and lang == args.pivot_lang:
                all_senses[args.pivot_lang][sense_id] = lemma
        translation_mappings = searcher.doc(doc).getValues("TRANSLATION_MAPPING")
        create_translation_mapping(translation_mappings, all_senses, all_translation_mappings, LANGUAGES_OF_INTEREST)
        if args.pivot_lang is not None:
            create_translation_mapping(translation_mappings, all_senses, all_translation_pivot1_mappings, [args.lang1, args.pivot_lang])
            create_translation_mapping(translation_mappings, all_senses, all_translation_pivot2_mappings, [args.lang2, args.pivot_lang])

    # Context managers ensure the pickle files are closed even on error.
    with open(f'{args.internal_data_path}/{name}.pkl', 'wb') as output:
        pickle.dump(all_translation_mappings, output)
    if args.pivot_lang is not None:
        with open(f'{args.internal_data_path}/{name}_{args.lang1}-{args.pivot_lang}.pkl', 'wb') as output:
            pickle.dump(all_translation_pivot1_mappings, output)
        with open(f'{args.internal_data_path}/{name}_{args.lang2}-{args.pivot_lang}.pkl', 'wb') as output:
            pickle.dump(all_translation_pivot2_mappings, output)
Пример #46
0
    def __init__(self):
        # Start the JVM for PyLucene (headless: no AWT windows).
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        print 'lucene', lucene.VERSION

        # Open the index stored in INDEX_DIR next to this script.
        self.base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        self.directory = SimpleFSDirectory(File(os.path.join(self.base_dir, INDEX_DIR)))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        # maxDoc() is the document-id upper bound (includes deleted docs).
        self.numDocs = self.reader.maxDoc()

        # MoreLikeThis finds documents similar to a given one; thresholds of 1
        # keep even rare terms eligible for similarity matching.
        self.mlt = MoreLikeThis(self.reader)
        self.mlt.setMinTermFreq(1)
        self.mlt.setMinDocFreq(1)

        '''
Пример #47
0
 def GET(self):
     """Handle an audio lookup request and render the result page."""
     params = web.input()
     session_user = login_aud()
     upload_path = str(params['uploadfile'])

     # Attach this worker thread to the already-running JVM.
     lucene.getVMEnv().attachCurrentThread()

     index_dir = SimpleFSDirectory(File("index2"))
     index_searcher = IndexSearcher(DirectoryReader.open(index_dir))
     tokenizer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

     (geshou, geming, zhuanji, liupai, shijian,
      jianjie, geci, imgurl) = run_aud(index_searcher, tokenizer, upload_path)
     del index_searcher
     return render.result_aud(session_user, geshou, geming, zhuanji, imgurl,
                              liupai, shijian, jianjie, geci)
def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    queries = makeQueryList(args["queryFile"])
    print 'lucene', lucene.VERSION
    print "\n"

    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print directory.getDirectory()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()

    run(searcher, analyzer, queries)
    del searcher
Пример #49
0
 def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
     """Wrap Lucene index + taxonomy readers with lazily-built searcher state."""
     self._settings = settings
     self._similarity = settings.similarity
     self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks

     # Open both the main index and the facet taxonomy.
     self._reader = DirectoryReader.open(indexDirectory)
     self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)

     # Expose current reader settings through a small get/set wrapper object.
     wrapper = ReaderSettingsWrapper()
     wrapper.get = lambda: {
         "similarity": self.searcher.getSimilarity().toString(),
         "numberOfConcurrentTasks": self._numberOfConcurrentTasks
     }
     wrapper.set = self._setReadSettings
     self._readerSettingsWrapper = wrapper

     # Searcher and executor are created on demand; the flag forces a rebuild.
     self._searcher = None
     self._executor = None
     self._reopenSearcher = True
Пример #50
0
    def search(self, query):
        """Search the tweet index for `query`, restricted to date "May 25".

        :Parameters:
        - `query`: raw user query string.

        :Returns:
        - List of dicts with keys 'score', 'username', 'tweet_body', 'date'.
        """
        # BUG FIX: initVM() may only be called once per process; reuse the
        # existing JVM on subsequent calls and attach the current thread.
        vm = lucene.getVMEnv() or lucene.initVM()
        vm.attachCurrentThread()

        luceneDirectory = "/index/"
        path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
        directory = FSDirectory.open(Paths.get(path))
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer()

        print("Searching for '" + query + "'")

        fields_to_search = ["text", "page title", "date"]
        filter_date = 'date:"May 25"'

        # BUG FIX: a space is required before AND; the old concatenation
        # produced 'date:"May 25"AND ...', fusing the filter and the operator.
        filtered_query = filter_date + " AND " + query

        parser = MultiFieldQueryParser(fields_to_search, analyzer)
        updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
        scored_documents = searcher.search(updated_query,
                                           10).scoreDocs  # array of docs

        print("Found " + str(len(scored_documents)) +
              " matches in the collection.")

        results = []
        for doc in scored_documents:
            scoredTweet = dict()
            scoredTweet['score'] = doc.score
            result = searcher.doc(doc.doc)
            scoredTweet['username'] = result.get("username")
            scoredTweet['tweet_body'] = result.get("text")
            scoredTweet['date'] = result.get("date")
            results.append(scoredTweet)
            print(scoredTweet)

        return results
Пример #51
0
def define_search_params(STORE_DIR, FIELD_CONTENTS):
    """Open the Lucene index at STORE_DIR and return (searcher, reader).

    FIELD_CONTENTS is accepted for interface compatibility; query
    construction is left to the caller.
    """
    index_store = SimpleFSDirectory(Paths.get(STORE_DIR))
    index_reader = DirectoryReader.open(index_store)
    return IndexSearcher(index_reader), index_reader
Пример #52
0
    def __init__(self, store_dir, analyzer, preprocess=lambda x: x):
        """Create a searcher over the Lucene index in `store_dir`.

        :Parameters:
        - `store_dir`: directory storing the Lucene index.
        - `analyzer`: analyzer used to split queries.
        - `preprocess`: user-defined preprocessing function for queries.
        """
        # Searcher over the on-disk index.
        self.dir = SimpleFSDirectory(File(store_dir).toPath())
        self.searcher = IndexSearcher(DirectoryReader.open(self.dir))

        self.preprocess = preprocess
        self.analyzer = analyzer

        # Highlighter formatter: wrap matched terms in <em> tags.
        self.formatter = SimpleHTMLFormatter('<em>', '</em>')
Пример #53
0
    def __init__(self, lang):
        """Initialise a searcher for the given language (only 'zh' supported)."""
        lucene.initVM()

        # Guard clause: anything other than Chinese is rejected up front.
        if lang != 'zh':
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
        self.analyzer = SmartChineseAnalyzer()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        # Use the project's custom similarity for ranking.
        self.searcher.setSimilarity(mySimilarity())
        logger.debug('search similarity func: {}'.format(
            self.searcher.getSimilarity()))
Пример #54
0
def search_trip(command):
    '''Look up a trip/attraction record for a unicode query string.

    Builds one MUST clause per parsed field of `command`, searches the
    Lucene index in "index_trip", re-ranks the hits with the custom
    `ranker`, and returns the info dict of the best-ranked file (or the
    string "Interior Error" when nothing matched).
    '''
    STORE_DIR = "index_trip"
    # Attach this thread to the module-level JVM environment.
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Per-source data folders and the metadata fields each one provides.
    folders = {
        'parsed_ctrip':
        ['source', 'location', 'introduction', 'score', 'img_list'],
        'parsed_qunar':
        ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
        'eic_mfw': ['location', 'introduction', 'img_list']
    }
    readers = constructReaders(folders)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # Every parsed field must match (boolean AND of all clauses).
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print 'total: %s' % (len(scoreDocs))

    # Re-rank the Lucene hits with the custom ranker, keeping ALL files
    # tied at the maximum rank (hence the list, not a single value).
    maxf = []
    maxrank = -1000.0
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        filename = doc.get('filename')
        rank = ranker(command_dict, getInfo(folders, readers, filename))
        if rank > maxrank:
            maxf = [filename]
            maxrank = rank
        elif rank == maxrank:
            maxf.append(filename)

    del searcher

    # No hit at all is an error; multiple ties are only warned about and
    # the first tied file wins.
    if len(maxf) == 0:
        print "error in searchtrip.py: no result while searching", command_dict.get(
            'location', '')
        return "Interior Error"
    elif len(maxf) != 1:
        print "warning in searchtrip.py: multiple results when searching", command_dict.get(
            'location', '')
    return getInfo(folders, readers, maxf[0])
Пример #55
0
def main(args):
    """Dump every document of a BabelNet Lucene index as JSON lines.

    Reads the index at args.dir_index and writes one JSON object per
    document to args.path_output. Returns 0 on success.
    """
    global verbose
    verbose = args.verbose

    if verbose:
        logger.info(f'Read {args.dir_index}')
    directory = SimpleFSDirectory.open(Paths.get(args.dir_index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    reader = searcher.getIndexReader()

    if verbose:
        logger.info(f'Write to {args.path_output}')
    with open(args.path_output, 'w') as f:
        for idx in trange(reader.maxDoc()):
            doc = reader.document(idx)
            lemmas = doc.getValues('LEMMA')
            # PERF FIX: fetch the parallel per-lemma arrays once per document
            # instead of calling doc.getValues(...) again for every lemma.
            sources = doc.getValues('LEMMA_SOURCE')
            languages = doc.getValues('LEMMA_LANGUAGE')
            weights = doc.getValues('LEMMA_WEIGHT')
            sense_keys = doc.getValues('LEMMA_SENSEKEY')
            forms = [
                {
                    'lemma': lemmas[i],
                    'source': sources[i],
                    'lang': languages[i],
                    'weight': weights[i],
                    'sense_key': sense_keys[i],
                }
                for i in range(len(lemmas))
            ]
            entry = {
                'id': doc.get('ID'),
                'synset': doc.get('SYNSET_ID'),
                'pos': doc.get('POS'),
                'type': doc.get('TYPE'),
                'main_sense': doc.get('MAIN_SENSE'),
                'categories': list(doc.getValues('CATEGORY')),
                'translation_mappings': list(doc.getValues('TRANSLATION_MAPPING')),
                'images': list(doc.getValues('IMAGE')),
                'forms': forms
            }
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    return 0
Пример #56
0
 def __init__(self,
              LUCENE_INDEX_DIR,
              similarity='BM25',
              lucene_vm_flag=False):
     """Open a memory-mapped Lucene index and set up a searcher.

     `lucene_vm_flag` indicates the JVM was already started by the
     caller; otherwise it is initialised here.
     """
     # Keep the exact `== False` comparison: only a literal False triggers
     # JVM startup, matching the original contract.
     if lucene_vm_flag == False:
         lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     self.lucene_vm_init = True

     self.index_dir = LUCENE_INDEX_DIR
     self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
     self.analyzer = StandardAnalyzer()
     self.config = IndexWriterConfig(self.analyzer)

     self.reader = DirectoryReader.open(self.index_mm)
     self.searcher = IndexSearcher(self.reader)

     # Caches filled lazily by other methods.
     self.dict_term_freq = {}
     self.dict_doc_field_title = {}

     if similarity == 'BM25':
         self.searcher.setSimilarity(BM25Similarity())
Пример #57
0
def main(storeDir):
    """Print the term-frequency vector of the 'body' field for each document."""
    reader = DirectoryReader.open(storeDir)
    doc_count = reader.numDocs()
    print("n_docs:", doc_count)

    for doc_id in range(doc_count):
        term_vector = reader.getTermVector(doc_id, 'body')
        if term_vector is None:
            continue
        freqs = {}
        terms_enum = term_vector.iterator()
        # Walk the per-document terms and record term -> frequency.
        for term in BytesRefIterator.cast_(terms_enum):
            postings = terms_enum.postings(None)
            postings.nextDoc()
            freqs[term.utf8ToString()] = postings.freq()
        print(freqs)

    reader.close()
Пример #58
0
    def runSimple(self):
        """Run the demo facet searches and return the last facet result.

        Opens the taxonomy and index readers, runs a per-term faceted search
        for every term in the module-level ``searchValues``, then one facet
        search over all documents; only the final result is returned.
        """
        # open readers
        taxo = DirectoryTaxonomyReader(self.taxoDir)
        indexReader = DirectoryReader.open(self.indexDir)

        for term in searchValues:
            print  "\nsearch by term '%s' ..." % term
            # NOTE(review): per-term results are overwritten each iteration
            # and then discarded below — apparently intentional demo output.
            facetRes = SimpleSearcher.searchWithTerm(term, indexReader, taxo,
                                                       self.facets_config)
        print  "\nsearch all documents  ..."
        facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo,
                                                   self.facets_config)
        # close readers
        taxo.close()
        indexReader.close()
        # return result
        return facetRes
Пример #59
0
    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        # BUG FIX: close the reader even when document()/get() raises,
        # otherwise the reader (and its file handles) leaks.
        try:
            doc = reader.document(docIdx)
            return doc.get(field)
        finally:
            reader.close()
Пример #60
0
    def __init__(self, path):
        """Open the index at `path` and prepare the tokenizer and w2v model."""
        print('Searcher initialized...')
        self.path = path
        # Chinese-aware analyzer for query tokenization.
        self.analyzer = SmartChineseAnalyzer()
        store = SimpleFSDirectory(Paths.get(self.path))
        self.reader = DirectoryReader.open(store)
        self.searcher = IndexSearcher(self.reader)
        # THULAC segmenter; '/' separates word and POS tag in its output.
        self.thu = thulac.thulac(deli='/')

        model_file = Path('w2v.model')
        if model_file.is_file():
            print('Model was already trained...loading model')
            self.w2v_model = Word2Vec.load('w2v.model')
        else:
            self.model_train()
            print('Model trained...')