def perform_search(self, searchterm, results_per_page, page):
    # create a QueryParser for each field to be searched
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

    # put the fields together; SHOULD means a match in any field is enough
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start

    # results to return
    results = []
    count = 0
    for scoreDoc in score_docs:
        # skip the offset of earlier pages
        if count < results_per_page * page:
            count += 1
            continue
        count += 1
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        results.append(table)
    return results, duration, count_results
def findPropertyURIs(self, propertyType, max): uris = list() # list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer) query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"") result = self._searcher.search(query, 1) freq = result.totalHits if max != None: freq = max.intValue() if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)) print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) self._searcher.explain(query, hit.doc) uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB)) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return uris
def findDirectTypes(self, instanceUri, max): dTypes = list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer) query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"") result = self._searcher.search(query, 1) logging.debug("For " + str(query) + " : " + str(result.totalHits)) freq = 0 if max != None: freq = max else: freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) self._searcher.explain(query, hit.doc) dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB)) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") logging.debug("there are " + str(len(dTypes)) + " unique direct types") return dTypes
def buscar(indexDir, args, options=None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    fsDir = SimpleFSDirectory(File(indexDir))

    # Create a searcher based on the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Create a QueryParser that searches the "keywords" field by default
    # and holds the search constraints
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)

    # Join the supplied parameters into a single query string
    command = ' +'.join(args)
    query = parser.parse(command)
    print query

    # Return a JArray with the results of the query
    return searcher.search(query, 200).scoreDocs
def more_like_this(self, result_num, query): result = [] queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called", self.porter_analyzer) if query: try: query = arranging_query_regex(query=query) # print '4. Right after the regex handling : ', query like_query = queryparser.parse(query) # print '5. Right after the Lucene parser : ', like_query hits = self.searcher.search(like_query, result_num).scoreDocs # filterScoreDosArray = hits.topDocs().scoreDocs; for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) # matched_terms = self.get_matched_keywords(like_query, hit.doc) result.append(doc.get("answer_id")) except Exception as e: print "AnswerSearcher: Error: %s" % e print(traceback.format_exc()) # self.searchermgr.decRef(self.searcher) self.searchermgr.release(self.searcher) self.searcher = None self.directory.close() self.directory = None return result
def get_image_pmcid(pmcid, classes = ""): fields = ["pmcid", "class"] docs = [] location = web.__path__[0] + "/static/web/files/index/index.figures" #lucene.initVM() vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() analyzer = StandardAnalyzer(Version.LUCENE_4_10_1) reader = IndexReader.open(SimpleFSDirectory(File(location))) searcher = IndexSearcher(reader) # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer) # query.setDefaultOperator(QueryParserBase.AND_OPERATOR) #query = query.parse(query, ('4175339','1')) # query.parse(queryString)#"Shigella sonnei" # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei" MAX = 10000 #hits = searcher.search(query, MAX) if classes == "all": queryStr = "pmcid:(" + ' '.join(pmcid) +")" else: queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query q = query.parse(queryStr) hits = searcher.search(q, MAX) for hit in hits.scoreDocs:#should only be one #print hit.score, hit.doc, hit.toString() docs.append(searcher.doc(hit.doc)) return docs #This will return the image documents that belong to a pmcid(article)
def more_like_this2(self, item_doc, result_num): similar_questions = [] if not item_doc: item_doc.append(ResultItem(None, 1.0, "No Title", 0)) query = "" if item_doc.doc: query += self.document_to_query(item_doc.doc) query = remove_unified_stop_lists(query) queryparser = QueryParser(Version.LUCENE_CURRENT, "term", self.analyzer) if query: try: like_query = queryparser.parse(query) hits = self.searcher.search(like_query, result_num).scoreDocs for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) similar_questions.append(doc.get("question_id")) except Exception as e: print "Question Searcher: Error: %s" % e # write_search_log("Question Searcher: Error: %s" % e + "\n") print(traceback.format_exc()) # self.searchermgr.decRef(self.searcher) # self.searchermgr.release(self.searcher) # self.searcher = None # self.directory.close() # self.directory = None return similar_questions # def release(self, searcher):
def Qsearch(self, query):
    words = seg.segment(query.strip())
    #words = self.segmentor.segment(query.strip())
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
    parser.setPhraseSlop(0)
    # "\"" + ' '.join(words) + "\"~0" means the words must appear as a contiguous phrase
    query = parser.parse("\"" + ' '.join(words) + "\"~0")

    totalHits = self.searcher.search(query, 50)
    #print "%s total matching documents." % totalHits.totalHits
    #return totalHits.totalHits
    for hit in totalHits.scoreDocs:
        #print "Hit Score:", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = self.searcher.doc(hit.doc)
        #print doc.get("name").encode("utf-8")
        #print "----------------------------------------"

    t = Term('contents', ' '.join(words))
    return self.reader.totalTermFreq(t)
def search(self, index_dir):
    # Get handle to index directory
    directory = SimpleFSDirectory(File(index_dir))

    # Creates a searcher searching the provided index.
    ireader = DirectoryReader.open(directory)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser. We specify what field to search into.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create the query
    query = queryParser.parse(self.query)

    # Run the query and keep the top self.retrieve_count results
    topDocs = searcher.search(query, self.retrieve_count)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    doc_ids = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_ids.append(doc.get(FIELD_PATH))
    return [int(item) for item in doc_ids]
def retrieve_sents(self):
    indexDir = self.indexDir
    query = self.query

    sent_ind_list = []
    # template = CustomTemplate(format)
    fsDir = SimpleFSDirectory(Paths.get(indexDir))
    # print indexDir
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(query)
    # print query

    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    # print query

    if self.stats:
        print("Found %d sentences (in %s) that matched query '%s':" %
              (len(scoreDocs), duration, query), file=sys.stderr)

    for scoreDoc in scoreDocs:
        # print scoreDoc.doc
        # doc = searcher.doc(scoreDoc.doc)
        sent_ind_list.append(scoreDoc.doc)

    return sent_ind_list
def more_like_this2(self, so_items): github_result = [] if not so_items: so_items.append(SOResultItem(None, 1.0, "No Title", 0, "")) query = "" for so_item in so_items: queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) if so_item.doc: query += self.document_to_query(so_item.doc) query += self.code_as_text() if query: print "-" * 30 print "UNified Query: %s" % query print "-" * 30 try: like_query = queryparser.parse(query) hits = self.searcher.search(like_query, 10).scoreDocs for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) matched_terms = self.get_matched_keywords2( like_query, hit.doc) # apis = [d.stringValue() for d in doc.getFields("typed_method_call")] print("file__", doc.get("file"), "file_content", doc.get("file_content"), "line_numbers", doc.get("line_numbers")) file_path = "/extdsk/FaCoY/Git_data/G" + doc.get( "file")[24:] print(file_path) content = None try: with open(file_path) as f: content = f.read() except: pass if content: item = GithubResultItem(doc.get("file"), content, matched_terms, hit.score, so_item, doc.get("line_numbers"), hit.doc) # code github_result.append(item) #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score)) except Exception as e: print "GitSearcher: Error: %s" % e print(traceback.format_exc()) # print Counter(files).most_common(5) return github_result
class SearchImgs(object):
    def __init__(self, store_dir, analyzer, preprocess=lambda x: x):
        '''
        Input: `store_dir`: directory storing the Lucene index
               `analyzer`: analyzer required to split the query
               `preprocess`: user-defined preprocess function
        '''
        # Initialize `IndexSearcher`
        self.dir = SimpleFSDirectory(File(store_dir).toPath())
        self.searcher = IndexSearcher(DirectoryReader.open(self.dir))
        self.preprocess = preprocess

        # Initialize `QueryParser`
        self.parser = QueryParser("description", analyzer)

    def search_command(self, command):
        '''
        Interface for other programs to search in a particular index.
        Input: `command`: raw query in str format
        Output: list of documents found in the index
        '''
        command = self.preprocess(command)
        score_docs = self.search(command)
        return self.output(score_docs)

    def search(self, command):
        '''
        Search for the query in the Lucene index.
        Input: `command`: keyword to be searched
        Output: score_docs satisfying the requirement
        '''
        query = self.parser.parse(command)
        return self.searcher.search(query, 50).scoreDocs

    def output(self, score_docs):
        '''
        Return the search results.
        Input: `score_docs`: search results from the index
        Output: list of document info found in the index; details include
                `title`, `url`, `description` and `action_url`
        '''
        results = []
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            result = {
                'title': doc.get('url_title'),
                'url': doc.get('img_url'),
                'description': doc.get('description').replace(' ', ''),
                'action_url': doc.get('url')
            }
            results.append(result)
        return results
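# A hedged usage sketch for the SearchImgs class above. The index directory, the
# analyzer choice and the example query are illustrative assumptions; any analyzer
# compatible with how the index was built would do.
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
img_searcher = SearchImgs('index/imgs', StandardAnalyzer(), preprocess=lambda q: q.strip())
for hit in img_searcher.search_command('mountain sunrise'):
    print(hit['title'], hit['url'])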
def more_like_this2(self, limit, item_doc, score_logs_for_each, user_query, flag): #flag = UQ(1) or not(0) bench_result = [] query = "" if flag == 1: query += user_query # item_doc = ResultItem(None, 0.0, "No Title", 'None','None', None) if flag == 0 and item_doc.doc: query += self.document_to_query(item_doc.doc) query = remove_unified_stop_lists(query) queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) if query: try: parsed_query = queryparser.parse(query) hits = self.searcher.search(parsed_query, limit).scoreDocs temp = 1 for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) matched = doc.get('file').split('/')[9].split('.')[0] score_logs_for_each += str(matched) + '\t' + str( round(hit.score, 2)) + '\n' matched_terms = self.get_matched_keywords2( parsed_query, hit.doc) # print "Matched Terms : ", matched_terms # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content")) temp += 1 file_path = doc.get("file") content = None try: with open(file_path) as f: content = f.read() except: pass if content: item = BenchResultItem(doc.get("file"), content, matched_terms, hit.score, item_doc, doc.get("line_numbers"), hit.doc) bench_result.append(item) except Exception as e: print "BenchSearcher Error: %s" % e print(traceback.format_exc()) # self.searchermgr.release() # self.searcher = None # self.directory.close() # self.directory = None return bench_result, score_logs_for_each
def pairSearch(self, pair, sim):
    """
    Method that searches through documents using only the content_section Field.
    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    title = pair[0].replace('_', ' ')
    content = pair[1]
    parser = QueryParser("content_section", self.analyzer)
    query1 = parser.parse(QueryParser.escape(title))
    query2 = parser.parse(QueryParser.escape(content))
    bq = BooleanQuery.Builder()
    bq.add(query1, BooleanClause.Occur.FILTER)
    bq.add(query2, BooleanClause.Occur.SHOULD)
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(bq.build(), 6).scoreDocs
    return hits
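# A hedged side-note in code form: in the BooleanQuery built above, a FILTER clause
# must match but does not contribute to the score, while SHOULD only affects ranking.
# The sketch below builds the same shape of query from plain strings; the field name
# and the analyzer argument are assumptions mirroring the method above.
def build_title_filtered_query(analyzer, title_text, content_text, field="content_section"):
    parser = QueryParser(field, analyzer)
    bq = BooleanQuery.Builder()
    bq.add(parser.parse(QueryParser.escape(title_text)), BooleanClause.Occur.FILTER)    # must match, not scored
    bq.add(parser.parse(QueryParser.escape(content_text)), BooleanClause.Occur.SHOULD)  # scored when it matches
    return bq.build()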
class SearchIndex(object): def __init__(self): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH'])) self.searcher = IndexSearcher(DirectoryReader.open(indexDir)) self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer) def search(self, q, page=1, duplicates=False): query = self.parser.parse(q) if not duplicates: query = self.addDuplicatesQuery(query) perPage = 10 start = (page - 1) * perPage results = TopScoreDocCollector.create(1000, True) self.searcher.search(query, results) highlighter = Highlighter(QueryScorer(query)) highlighter.setTextFragmenter(SimpleFragmenter(40)) docs = [] for scoreDoc in results.topDocs(start, perPage).scoreDocs: doc = self.searcher.doc(scoreDoc.doc) tokenStream = self.analyzer.tokenStream( "contents", StringReader(doc['contents'])) highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...") docs.append({ 'title': doc['title'], 'url': doc['url'], 'duplicate': doc['duplicate'], 'highlight': highlight }) del self.searcher totalPages = int(math.ceil(results.getTotalHits() / float(perPage))) return totalPages, docs def addDuplicatesQuery(self, query): not_duplicate = TermQuery(Term('duplicate', 'false')) booleanQuery = BooleanQuery() booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST) booleanQuery.add(query, BooleanClause.Occur.MUST) return booleanQuery
def simpleSearchID(self, query, sim):
    """
    Method that searches through documents using only the id_section Field.
    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    parser = QueryParser("id_section", self.analyzer)
    query = parser.parse(QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def query_parser_filter(self, field_values, field_filter=['Vector']):
    """
    Filtering queries according to field values
    :param field_values: values of the fields
    :param field_filter: fields to filter
    """
    assert len(field_filter) == len(field_values), \
        "Number of fields different from number of values"
    for i in range(len(field_filter)):
        query_parser = QueryParser(field_filter[i], self.analyzer)
        query = query_parser.parse(field_values[i])
        self.constrained_query.add(query, BooleanClause.Occur.FILTER)
def more_like_this3(self, limit, score_logs_for_each, user_query): query = "" bench_result = [] # if not item_doc: # item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0)) # if item_doc.doc: # query += self.document_to_query(item_doc.doc) query += user_query query = remove_unified_stop_lists(query) queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) if query: try: parsed_query = queryparser.parse(query) hits = self.searcher.search(parsed_query, limit).scoreDocs temp = 1 for i, hit in enumerate(hits): score_logs_for_each += str(round(hit.score, 2)) + '\n' doc = self.searcher.doc(hit.doc) matched_terms = self.get_matched_keywords2( parsed_query, hit.doc) # print "Matched Terms : ", matched_terms # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content")) temp += 1 file_path = doc.get("file") content = None try: with open(file_path) as f: content = f.read() except: pass if content: item = BenchResultItem_UQ(doc.get("file"), content, matched_terms, hit.score, doc.get("line_numbers"), hit.doc) bench_result.append(item) except Exception as e: print "BenchSearcher Error: %s" % e print(traceback.format_exc()) # self.searchermgr.decRef(self.searcher) # self.searchermgr.release(self.searcher) # self.searcher = None # self.directory.close() # self.directory = None return bench_result, score_logs_for_each
def query_parser_must(self, field_values, field_must=['Text']):
    """
    The values that the fields must match
    :param field_values: values of the fields
    :param field_must: fields that must match
    """
    assert len(field_must) == len(field_values), \
        "Number of fields different from number of values"
    for i in range(len(field_must)):
        query_parser = QueryParser(field_must[i], self.analyzer)
        query = query_parser.parse(field_values[i])
        self.constrained_query.add(query, BooleanClause.Occur.MUST)
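# A minimal sketch of how the MUST/FILTER clauses accumulated by the two methods above
# might be executed. It assumes self.constrained_query is a BooleanQuery.Builder and
# self.searcher an IndexSearcher (both inferred from the surrounding code, not confirmed),
# and would live on the same class.
def run_constrained_query(self, limit=50):
    query = self.constrained_query.build()
    hits = self.searcher.search(query, limit).scoreDocs
    return [self.searcher.doc(hit.doc) for hit in hits]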
class LuceneRanker(object):
    def __init__(self, tfidf_path, strict=True):
        lucene.initVM()
        analyzer = StandardAnalyzer()
        reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(tfidf_path)))
        self.searcher = IndexSearcher(reader)
        self.parser = QueryParser("text", analyzer)
        self.parser.setDefaultOperator(QueryParser.Operator.OR)

    def closest_docs(self, query, k=1):
        """Closest docs by dot product between query and documents
        in tfidf weighted word vector space.
        """
        query = self.parser.parse(
            query.replace('/', '//').replace('?', '').replace('"', ''))
        hits = self.searcher.search(query, k)
        docids = []
        docs = []
        for i, hit in enumerate(hits.scoreDocs):
            doc = self.searcher.doc(hit.doc)
            docs.append(unicode(doc['text']))
            docids.append(unicode(doc['title']))
        return docids, docs

    def batch_closest_docs(self, queries, k=1, num_workers=None):
        """Process a batch of closest_docs requests multithreaded."""
        # get highest scoring document for multiple queries
        batch = []
        for i, q in enumerate(queries):
            if i % 100 == 0:
                print(i)
            t0 = time.time()
            docids, docs = self.closest_docs(q, k)
            batch.append((docids, docs))
        return batch

    def parse(self, query):
        return None

    def text2spvec(self, query):
        return None

    def get_doc_index(self, doc_id):
        return 0

    def get_doc_id(self, doc_index):
        return 0

    def __exit__(self, *args):
        pass
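# Hedged usage sketch for LuceneRanker; the index path and query strings are
# placeholders, and the class itself calls lucene.initVM() in its constructor.
ranker = LuceneRanker('/path/to/lucene/index')
docids, docs = ranker.closest_docs('who wrote the iliad', k=3)
batch = ranker.batch_closest_docs(['query one', 'query two'], k=3)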
def define_search_params(STORE_DIR, FIELD_CONTENTS, TERM):
    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(store)
    searcher = IndexSearcher(reader)

    # Get the analyzer
    analyzer = WhitespaceAnalyzer()

    # Constructs a query parser. We specify what field to search into.
    queryParser = QueryParser(FIELD_CONTENTS, analyzer)

    # Create the query
    query = queryParser.parse(TERM)
    return searcher, reader, query
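# Hedged usage sketch for define_search_params(). The store path and field names are
# taken from the get_doc_list() snippet further below; the search term is a placeholder.
searcher, reader, query = define_search_params('./full_index1', 'text', 'influenza')
for scoreDoc in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(scoreDoc.doc).get('identifier'))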
def getQueryBuiler():
    # builder = QueryBuilder(analyzer)
    boolean_query = BooleanQuery.Builder()
    # print(args.search)
    if len(args.search) == 0:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query

    for i in range(len(args.search)):
        curSearch = args.search[i].split(' ')

        if curSearch[1] == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif curSearch[1] == 'intrange':
            query = IntPoint.newRangeQuery(curSearch[2], curSearch[3], curSearch[4])
        elif curSearch[1] == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(curSearch[2], lowerDate, upperDate, True, True)

        if curSearch[0] == 'must':
            boolean_query.add(query, BooleanClause.Occur.MUST)
        elif curSearch[0] == 'should':
            boolean_query.add(query, BooleanClause.Occur.SHOULD)
        elif curSearch[0] == 'filter':
            boolean_query.add(query, BooleanClause.Occur.FILTER)
        elif curSearch[0] == 'must_not':
            boolean_query.add(query, BooleanClause.Occur.MUST_NOT)
        else:
            print('raise exception')
            # raise Exception
            # exit()

    return boolean_query
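# Hedged sketch of how getQueryBuiler() is driven, inferred from the split(' ')
# indexing above: each --search entry looks like "<occur> <kind> <field> <value> [<value2>]",
# for example:
#   "must query method1 options"
#   "filter intrange response_code 200 300"
#   "should termrange date_time 19/Jul/2020:05:40:00 19/Jul/2020:06:45:04"
# The global `searcher` used below is an assumption mirroring the rest of this script.
boolean_query = getQueryBuiler()
hits = searcher.search(boolean_query.build(), 50).scoreDocs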
def retrieve_wiki(text_query, searcher, analyzer):
    txt = text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    except:
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)

    scoreDocs = searcher.search(query, 1000).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')
def similarityOfSynopsis(self): directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX)) ireader = DirectoryReader.open(directory) searcher = IndexSearcher(ireader) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer) for root, dirnames, filenames in os.walk(settings.SYNOPSIS): filenames = [int(item) for item in filenames] filenames.sort() filenames = [str(item) for item in filenames] for filename in filenames: path = os.path.join(root, filename) major_movie = models.Movie.objects.get(pk=filename) with open(path, 'r') as moviedoc: content = moviedoc.read().replace('\n', ' ') content = re.sub('[^A-Za-z0-9 ]+', '', content) while True: try: query = queryParser.parse( QueryParser.escape(content)) except Exception as e: self.boolean_query.setMaxClauseCount( self.boolean_query.maxClauseCount * 2) print self.boolean_query.maxClauseCount continue break topDocs = searcher.search(query, len(filenames)) scoreDocs = topDocs.scoreDocs for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) movie_id = int(doc.get(FIELD_PATH)) if movie_id <= major_movie.id: continue minor_movie = models.Movie.objects.get(pk=movie_id) try: similarity = models.Similarities.objects.filter( first_movie=major_movie, second_movie=minor_movie).first() if not similarity: similarity = models.Similarities.objects.filter( first_movie=minor_movie, second_movie=major_movie).first() similarity.synopsis = scoreDoc.score similarity.save() except Exception as e: print major_movie.id, minor_movie.id raise e print u"{0} completed.".format(major_movie.id)
class Searcher: def __init__(self, indexDir): self.directory = SimpleFSDirectory(Paths.get(indexDir)) self.reader = DirectoryReader.open(self.directory) self.searcher = IndexSearcher(self.reader) self.nameQueryParser = QueryParser('name', StandardAnalyzer()) self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND) self.idQueryParser = QueryParser('id', StandardAnalyzer()) self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND) def find_by_name(self, name): query = self.nameQueryParser.parse(name) docs = self.searcher.search(query, 100).scoreDocs tables = [] for scoreDoc in docs: doc = self.searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) tables.append(table) return tables def find_by_id(self, id): query = self.idQueryParser.parse(id) docs = self.searcher.search(query, 100).scoreDocs tables = [] for scoreDoc in docs: doc = self.searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) tables.append(table) return tables def close(self): self.directory.close() self.reader.close()
def doc_search(self, keywords):
    analyzer = StandardAnalyzer()
    parser = QueryParser('Title', analyzer)
    query = parser.parse(keywords)

    try:
        collector = TopScoreDocCollector.create(3000)
        self.lSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
    except RuntimeError:
        print "Score document run failed"

    self.hits = hits
    return hits
def author_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() rootdir = OUT_RAW_DIR from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' results = {} for hit in hits: doc = searcher.doc(hit.doc) entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) short_title = entry['short_title'] print(entry['prim_author']) if qry in entry['prim_author'].lower(): fname = short_title + CONTENT_EXT results[entry_id] = {'title': short_title, 'file': fname } f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w') f.write(json.dumps(results)) f.close() return json.dumps(results)
def perform_search(self, searchterm):
    # processing a query
    parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    query = parser.parse(searchterm)

    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    print scoreDocs
    print duration
def more_like_this(self, so_items): github_result = [] if not so_items: so_items.append(SOResultItem(None, 1.0, "No Title", 0, "")) for so_item in so_items: queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) query = "" if so_item.doc: query = self.document_to_query(so_item.doc) query += self.code_as_text() if query: print "-" * 30 print "Query: %s" % query print "-" * 30 try: like_query = queryparser.parse(query) hits = self.searcher.search(like_query, 10).scoreDocs for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) matched_terms = self.get_matched_keywords2( like_query, hit.doc) # apis = [d.stringValue() for d in doc.getFields("typed_method_call")] item = GithubResultItem(doc.get("file"), decompress( doc.get("file_content")), matched_terms, hit.score, so_item, doc.get("line_numbers"), hit.doc) # code github_result.append(item) #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score)) except Exception as e: print "Error: %s" % e # print Counter(files).most_common(5) return github_result
def more_like_this2(self, limit, item_doc, user_query, flag): #flag = UQ(1) or not(0) results = [] query = "" if flag == 1: query += user_query # item_doc = ResultItem(None, 0.0, "No Title", 'None','None', None) if flag == 0 and item_doc.doc: query += self.document_to_query(item_doc.doc) query = remove_unified_stop_lists(query) queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) if query: try: parsed_query = queryparser.parse(query) hits = self.searcher.search(parsed_query, limit).scoreDocs temp = 1 for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) matched_terms = self.get_matched_keywords2( parsed_query, hit.doc) temp += 1 file_path = doc.get("file") content = None try: with open(file_path) as f: content = f.read() except: pass if content: item = GithubResultItem(doc.get("file"), content, matched_terms, hit.score, item_doc, doc.get("line_numbers"), hit.doc) results.append(item) except Exception as e: print "GitHub Searcher Error: %s" % e print(traceback.format_exc()) return results
def search_phrase(self, term, phrase): print('Phrase search') self.hits = [] index_list = [] parser = QueryParser('text', self.analyzer) query = parser.parse(term) hits = self.searcher.search(query, 1000).scoreDocs if hits is None: return for hit in hits: index = [] doc = self.searcher.doc(hit.doc) text = doc.get("text") phrases = doc.get("phrase") # processing with saved text and phrase terms = text.split() phrases = phrases.split() flag = 1 # this flag is judging for phrase in every target term in text index = [] # index number for searched term, maybe many terms for i in range(len(terms)): if term == terms[i]: index.append(i) if not phrase == phrases[i]: flag = 0 break if flag == 1: self.hits.append(text) index_list.append(index) self.recover_sentence(index_list) hits_copy = self.hits self.hits = [] # add font tags for terms for hit in hits_copy: simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML) highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query)) highLightText = highlighter.getBestFragment( self.analyzer, 'text', hit) if highLightText is not None: self.hits.append(highLightText) return self.hits[:40]
def custom_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() rootdir = OUT_RAW_DIR from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' print rootdir results = {} for hit in hits: doc = searcher.doc(hit.doc) entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) short_title = entry['short_title'] year = entry['publ_year'] fname = short_title + CONTENT_EXT results[fname] = year;
def do_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File print os.path.abspath(os.path.pardir) reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' results = [] for hit in hits: doc = searcher.doc(hit.doc); entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) #print 'entry:', entry score = hit.score #print 'Hit:', entry['short_title'], score results.append((score, doc, entry)) return results
def findTopClasses(self): propertyURI = RDFS.SUBCLASSOF allClasses = list() topClasses = list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer) query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"") result = self._searcher.search(query, 1) logging.debug("For " + str(query) + " : " + str(result.totalHits)) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB)) indexus += 1 # for (String classUri : allClasses) { indexus = 0 while indexus < len(allClasses): classUri = allClasses[indexus] logging.info("Checking whether " + classUri + " is a top class.") # search inst and pred retrieve class # if class exists that means it is not top class otherwise add to # topClasses classes = self.searchForClass(classUri, propertyURI) logging.info("top classes:" + str(len(classes))) if classes != None or len(classes) > 0: logging.info("This is not a top class...") else: topClasses.append(classUri) logging.info("Adding " + classUri + " to top classes.") indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return topClasses
def closest_docs(self, question_, k=5): """Closest docs by dot product between query and documents in tfidf weighted word vector space. """ doc_scores = [] doc_ids = [] doc_texts = [] words = self.parse(utils.normalize(question_)) query = ' '.join(words) if not query: logger.warning('has no query!') return doc_ids, doc_scores, doc_texts # bq_builder = BooleanQuery.Builder() # title_query = TermQuery(Term("title", query)) # # boosted_title_query = BoostQuery(title_query, 2) # bq_builder.add(TermQuery(Term("text", query)), BooleanClause.Occur.SHOULD) # bq_builder.add(title_query, BooleanClause.Occur.SHOULD) # lucene_query = bq_builder.build() # lucene_query = self.query_parser.parse(query, ["title", "text"], # [BooleanClause.Occur.SHOULD, BooleanClause.Occur.MUST], # self.analyzer) # lucene_query = 'title:"{0}"^2 OR "{0}"'.format(query) self.env.attachCurrentThread() query_parser = QueryParser("text", self.analyzer) search_results = self.searcher.search(query_parser.parse(query), k).scoreDocs for search_result in search_results: doc = self.searcher.doc(search_result.doc) doc_id = doc["id"] + ", title=" + doc["title"] doc_score = search_result.score text = doc["text"] doc_ids.append(doc_id) doc_scores.append(doc_score) doc_texts.append(text) # print('id:', doc_id, 'ds:', doc_score, 'text:', text) # logger.debug('question_d:%s, query:%s, doc_ids:%s, doc_scores:%s' # % (question_, query, doc_ids, doc_scores)) return doc_ids, doc_scores, doc_texts
def search(termo, **args):
    indexDir = os.environ.get('MANDEX') or '3iteracao'
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(termo + ' '.join(args.values()))

    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    politicos = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        politicos.append(table)
    return politicos
def get_document_vector(searcher, reader, document_id, id_field, text_field):
    '''
    Given a document id, fetch the tf-idf vector of the document.
    '''
    tc_dict = {}     # Counts of each term
    dc_dict = {}     # Number of docs associated with each term
    tfidf_dict = {}  # TF-IDF values of each term in the doc

    # Get the document id.
    query_parser = QueryParser(id_field, WhitespaceAnalyzer())
    score_docs = searcher.search(query_parser.parse(str(document_id)), 1).scoreDocs

    if len(score_docs) > 0:
        # get the tf-idf vector.
        termVector = reader.getTermVector(score_docs[0].doc, text_field)
        termsEnumvar = termVector.iterator()
        termsref = BytesRefIterator.cast_(termsEnumvar)
        N_terms = 0
        try:
            while (termsref.next()):
                termval = TermsEnum.cast_(termsref)
                fg = termval.term().utf8ToString()  # Term in unicode
                if len(fg) > 3 and not fg.isdigit():
                    tc = termval.totalTermFreq()  # Term count in the doc
                    # Number of docs having this term in the index
                    dc = reader.docFreq(Term(text_field, termval.term()))
                    N_terms = N_terms + 1
                    tc_dict[fg] = tc
                    dc_dict[fg] = dc
        except:
            print('error in term_dict')

        # Compute TF-IDF for each term
        for term in tc_dict:
            tf = tc_dict[term] / N_terms
            idf = 1 + math.log(reader.numDocs() / (dc_dict[term] + 1))
            tfidf_dict[term] = tf * idf

    return tfidf_dict
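# A small follow-up sketch: comparing two documents by cosine similarity over the
# tf-idf dictionaries returned by get_document_vector(). The searcher/reader objects,
# the document ids and the 'identifier'/'text' field names are illustrative assumptions.
import math

def cosine_similarity(vec_a, vec_b):
    # dot product over the shared vocabulary, normalised by both vector lengths
    dot = sum(vec_a[t] * vec_b[t] for t in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

vec1 = get_document_vector(searcher, reader, 101, 'identifier', 'text')
vec2 = get_document_vector(searcher, reader, 102, 'identifier', 'text')
print(cosine_similarity(vec1, vec2))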
def search(self, term, window=2): self.hits = [] index_list = [] sort_para = term parser = QueryParser('text', self.analyzer) query = parser.parse(term) print(query) # Jump to multi-terms search if there are several words if self.multi_terms(query): self.search_multi_terms(query) return self.hits[:40] hits = self.searcher.search(query, 1000).scoreDocs for hit in hits: index = [] doc = self.searcher.doc(hit.doc) text = doc.get("text") self.hits.append(text) # save indexes of target term in each document terms = text.split() for i in range(len(terms)): if term == terms[i]: index.append(i) index_list.append(index) self.recover_sentence(index_list, window) hits_copy = self.hits self.hits = [] for hit in hits_copy: simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML) highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query)) highLightText = highlighter.getBestFragment( self.analyzer, 'text', hit) if highLightText is not None: self.hits.append(highLightText) print('search over') return self.hits[:40]
def doc_search(self, field, keywords, numHits):
    if field != 'All':
        analyzer = StandardAnalyzer()
        parser = QueryParser(field, analyzer)
        query = parser.parse(keywords)
    else:
        analyzer = WhitespaceAnalyzer()
        parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
        query = MultiFieldQueryParser.parse(parser, keywords)

    # self.lReader.getDocCount("title");
    try:
        collector = TopScoreDocCollector.create(numHits)
        self.lSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
    except RuntimeError:
        print "Score document run failed"

    self.hits = hits
    self.field = field
    return hits
def more_like_this2(self, limit, score_logs_for_each, user_query, flag): bench_result = [] query = "" if flag == 1: query += user_query query = remove_unified_stop_lists(query) queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) if query: try: parsed_query = queryparser.parse(query) hits = self.searcher.search(parsed_query, limit).scoreDocs temp = 1 for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) matched = doc.get('file').split('/')[9].split('.')[0] score_logs_for_each += str(matched) + '\t' + str(round(hit.score, 2)) + '\n' matched_terms = self.get_matched_keywords2(parsed_query, hit.doc) temp += 1 file_path = doc.get("file") content = None try: with open(file_path) as f: content = f.read() except: pass if content: item = BenchResultItem_UQ(doc.get("file"), content, matched_terms, hit.score, doc.get("line_numbers"), hit.doc) bench_result.append(item) except Exception as e: print "BenchSearcher Error: %s" % e print(traceback.format_exc()) return bench_result, score_logs_for_each
def extract_phrase_query(self, q, field, slop=0, boost=5):
    phrases = re.findall(r'"([^"]*)"', q)
    if len(phrases) == 0:
        return None, q

    q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
    if self.verbose:
        print "Detected phrases: ", phrases

    bq = BooleanQuery()
    for phrase in phrases:
        qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
        # parse phrase - this may or may not be desired
        pq = qparser.parse('%s "%s"~%d^%.1f' % (phrase, phrase, slop, boost))
        # phrase queries have high priority
        bq.add(pq, BooleanClause.Occur.MUST)
        # bq.add(pq, BooleanClause.Occur.SHOULD)
    return bq, q
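# Hedged usage sketch for extract_phrase_query(): the phrase part comes back as a
# BooleanQuery (or None) plus the remaining free text, which is parsed separately and
# OR-ed in. The field name is a placeholder, and this assumes it lives on the same
# class (self.analyzer, same Lucene API era as the method above).
def build_full_query(self, raw_query, field="contents"):
    phrase_query, rest = self.extract_phrase_query(raw_query, field, slop=1, boost=5)
    qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
    if phrase_query is None:
        return qparser.parse(rest)
    if rest:
        # keep the non-phrase terms as an optional clause alongside the boosted phrases
        phrase_query.add(qparser.parse(rest), BooleanClause.Occur.SHOULD)
    return phrase_query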
def more_like_this2(self, item_doc, result_num):
    # Iterate over the incoming question docs, build the final query from them,
    # and search the Question index for similar questions.
    similar_questions = []
    if not item_doc:
        item_doc.append(ResultItem(None, 1.0, "No Title", 0))
    query = ""
    if item_doc.doc:
        query += self.document_to_query(item_doc.doc)

    query = remove_unified_stop_lists(query)

    queryparser = QueryParser(Version.LUCENE_CURRENT, "term", self.analyzer)
    if query:
        # At this point the unified query has already been tokenized and stemmed.
        try:
            like_query = queryparser.parse(query)
            # top results similar to each question (e.g. 3 per question, 9 in total)
            hits = self.searcher.search(like_query, result_num).scoreDocs
            for i, hit in enumerate(hits):
                doc = self.searcher.doc(hit.doc)
                similar_questions.append(doc.get("question_id"))
        except Exception as e:
            print "Question Searcher: Error: %s" % e
            # write_search_log("Question Searcher: Error: %s" % e + "\n")
            print(traceback.format_exc())

    # self.searchermgr.decRef(self.searcher)
    # self.searchermgr.release(self.searcher)
    # self.searcher = None
    # self.directory.close()
    # self.directory = None
    return similar_questions
def get_doc_list(TERM, searcher, reader):
    FIELD_CONTENTS = "text"
    DOC_NAME = "identifier"
    STORE_DIR = "./full_index1"

    # Get the analyzer
    analyzer = WhitespaceAnalyzer()

    # Constructs a query parser. We specify what field to search into.
    queryParser = QueryParser(FIELD_CONTENTS, analyzer)

    # Create the query
    query = queryParser.parse(TERM)

    #lucene.initVM()
    #searcher, reader, query = define_search_params(STORE_DIR, FIELD_CONTENTS, TERM)

    # fieldInfos = MultiFields.getMergedFieldInfos(reader)
    # print(fieldInfos)
    # for fieldInfo in fieldInfos.iterator():
    #     print(fieldInfo.name)

    # Run the query and get documents that contain the term
    return searcher.search(query, reader.numDocs())
class TASearcher(): def __init__(self, queries=[], criteria=[], conjunctions=[], orderby=["ta"], ascending=True, limit=10000): vm.attachCurrentThread() self.queries = [query for query in queries if len(query.strip()) > 0] self.criteria = criteria self.conjunctions = conjunctions self.orderby = orderby self.ascending = ascending self.queryString = "" self.limit = limit self.fields = fields self.analyzer = PorterStemmerAnalyzer() self.queryParser = QueryParser(Version.LUCENE_30, "freetext", self.analyzer) self.queryParser.setAllowLeadingWildcard(True) self.queryParser.setDefaultOperator(QueryParser.Operator.AND) indexDir = settings.LUCENE_INDEX_DIRECTORY self.index = MMapDirectory(File(indexDir)) def createQueryString(self): # Simple if len(self.criteria) == 0: self.queryString = "(%s) OR freetext-normalized:(%s)" % (self.queries[0], self.queries[0]) # Advanced else: queryPairs = [] criteriaQueries = zip(self.criteria, self.queries) self.criteria = dict(criteriaQueries).keys() for criterion, query in criteriaQueries: if criterion in ("volume", "number", "category-label", "pubtype", "author-sort"): queryPairs.append("%s:%s" % (criterion, query)) elif criterion == "year": queryPairs.append("year-start:%s OR year-end:%s" % (query, query)) else: queryPairs.append('%s:%s OR %s-normalized:%s' % (criterion, query, criterion, query)) # queryPairs = ["%s:%s"%(criterion,query.replace(" ", "+")) for criterion, query in zip(criteria, queries)] try: queryString = "%s %s" % (queryPairs[0], " ".join( ["%s (%s)" % (conj, pair) for conj, pair in zip(self.conjunctions, queryPairs[1:])])) self.queryString = queryString return queryString except: self.queryString = "freetext" return self.queryString def getQueryString(self): return self.queryString def _getHits(self): reader = IndexReader.open(self.index) searcher = IndexSearcher(reader) # Sortierung nach Band- und Eintragsnummer (4: Wert als Integer behandeln) sortDict = { "ta": (("volume", SortField.Type.INT), ("number", SortField.Type.INT)), "year": (("year-start", SortField.Type.INT), ("year-end", SortField.Type.INT)), "author-title": (("author-sort", SortField.Type.STRING), ("title-sort", SortField.Type.STRING)), "title": (("title-sort", Locale.GERMAN),), "author": (("author-sort", Locale.GERMAN),), } sortFields = [] reverse = not self.ascending for name in self.orderby: for fieldName, typeNum in sortDict.get(name, []): sortFields.append(SortField(fieldName, typeNum, reverse)) if len(sortFields) == 0: sortFields = [SortField("volume", SortField.Type.INT), SortField("number", SortField.Type.INT)] sort = Sort(sortFields) topDocs = searcher.search(self.query, None, 80000, sort) hits = topDocs.scoreDocs self.hits = hits self.searcher = searcher lang = translation.get_language() if lang != "de": lang = "en" facets = {"author": {}, "pubtype": {}, "category-%s" % lang: {}} # Highlighting highlighter = Highlighter(SimpleHTMLFormatter('<span class="highlight">', '</span>'), QueryScorer(self.query)) hitObjects = [] fields = {} for hit in hits: doc = searcher.doc(hit.doc) # print unicode(doc) fields["score"] = hit.score fields["volume"] = doc["volume"] fields["number"] = doc["number"] fields["id"] = doc["id"] fields["title"] = doc["title"] fields["author"] = doc["author"] fields["authors"] = [field.stringValue() for field in doc.getFields("author")] for author in fields["authors"]: # XXX facets["author"][author] = facets["author"].get(author, 0) + 1 # XXX fields["categories"] = [field.stringValue() for field in doc.getFields("category-%s" % lang)] for cat in 
fields["categories"]: facets["category-%s" % lang][cat] = facets["category-%s" % lang].get(cat, 0) + 1 maxNumFragmentsRequired = 2 fragmentSeparator = "..."; pubtype = doc["pubtype"] fields["pubtype"] = pubtype facets["pubtype"][pubtype] = facets["pubtype"].get(pubtype, 0) + 1 fields["city"] = doc["city"] fields["year"] = doc["year-start"] if fields["year"] and doc["year-end"] and doc["year-end"] != fields["year"]: fields["year"] += " - " + doc["year-end"] highlightFields = ("title", "author", "city", "year", "category") if "freetext" in self.criteria: for fieldName in highlightFields: try: tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName])) newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired, fragmentSeparator) if len(newVal) > 0: # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal) fields[fieldName] = newVal except: continue for fieldName in highlightFields: if fieldName in self.criteria or fieldName + "-de" in self.criteria or fieldName + "-en" in self.criteria: try: tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName])) newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired, fragmentSeparator) if len(newVal) > 0: # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal) fields[fieldName] = newVal except: continue """if "author" in self.criteria: try: tokenStream = self.analyzer.tokenStream("author", lucene.StringReader(fields["author"])) fields["author"] = highlighter.getBestFragments(tokenStream, fields["author"], maxNumFragmentsRequired, fragmentSeparator) except: pass""" hitObjects.append( Hit(fields["id"], fields["volume"], fields["number"], fields["title"], fields["author"], fields["city"], fields["year"], fields["categories"], fields["pubtype"], fields["score"])) facetsToDelete = [] for facet in facets: if len(facets[facet]) < 2: facetsToDelete.append(facet) continue values = sorted(facets[facet].items(), key=itemgetter(0)) values = sorted(values, key=itemgetter(1), reverse=True) facets[facet] = values[:25] for facet in facetsToDelete: del facets[facet] self.facets = facets reader.close() self.hitObjects = hitObjects return hitObjects def search(self): self.createQueryString() querystr = self.getQueryString() self.query = self.queryParser.parse(querystr) return self._getHits() def getAll(self): self.query = MatchAllDocsQuery() return self._getHits()
class HighlighterTestCase(PyLuceneTestCase): """ Unit tests ported from Java Lucene. 2004 by Yura Smolsky ;) """ FIELD_NAME = "contents" texts = [ "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem. Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented." "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem." "From http://cognexus.org/id42.htm" "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems. Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches." "This text has a typo in referring to whicked problems" ]; def __init__(self, *args): super(HighlighterTestCase, self).__init__(*args) self.parser = QueryParser(Version.LUCENE_CURRENT, self.FIELD_NAME, StandardAnalyzer(Version.LUCENE_CURRENT)) def setUp(self): super(HighlighterTestCase, self).setUp() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) writer = self.getWriter(analyzer=self.analyzer) for text in self.texts: self.addDoc(writer, text) writer.commit() writer.close() self.reader = self.getReader() self.numHighlights = 0; def testSimpleHighlighter(self): self.doSearching("Wicked") highlighter = Highlighter(QueryScorer(self.query)) highlighter.setTextFragmenter(SimpleFragmenter(40)) maxNumFragmentsRequired = 2 for scoreDoc in self.scoreDocs: text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME) tokenStream = self.analyzer.tokenStream(self.FIELD_NAME, StringReader(text)) result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...") print "\t", result # Not sure we can assert anything here - just running to check we don't # throw any exceptions def testGetBestFragmentsSimpleQuery(self): self.doSearching("Wicked") self.doStandardHighlights() self.assert_(self.numHighlights == 3, ("Failed to find correct number of highlights, %d found" %(self.numHighlights))) def doSearching(self, queryString): self.searcher = self.getSearcher() self.query = self.parser.parse(queryString) # for any multi-term queries to work (prefix, wildcard, range, # fuzzy etc) you must use a rewritten query! self.query = self.query.rewrite(self.reader) print "Searching for:", self.query.toString(self.FIELD_NAME) self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs self.numHighlights = 0 def doStandardHighlights(self): formatter = TestFormatter(self) highlighter = Highlighter(formatter, QueryScorer(self.query)) highlighter.setTextFragmenter(SimpleFragmenter(20)) for scoreDoc in self.scoreDocs: text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME) maxNumFragmentsRequired = 2 fragmentSeparator = "..." tokenStream = self.analyzer.tokenStream(self.FIELD_NAME, StringReader(text)) result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator) print "\t", result def countHighlightTerm(self): self.numHighlights += 1 # update stats used in assertions def addDoc(self, writer, text): d = Document() f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED) d.add(f) writer.addDocument(d)
def run(command, searcher, aWrapper):
    print
    if command == '':
        return
    #debug
    #print "Searching for:"+command
    #query = MultiFieldQueryParser(Version.LUCENE_CURRENT,['subject_id','summary'],analyzer).parse(command);
    #query = MultiFieldQueryParser.parse(command,['subject_id','summary'],analyzer);
    #'''
    #MultiFieldQueryParser(Version matchVersion, String[] fields, Analyzer analyzer)
    #'''
    #parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, JArray('string')(['subject_id','summary']),analyzer)
    #query = MultiFieldQueryParser.parse(parser, command_jarr)

    # create a QueryParser object; the default search field is 'title'
    parser = QueryParser(Version.LUCENE_CURRENT, "title", aWrapper)
    # A PerFieldAnalyzerWrapper can be used like any other analyzer,
    # for both indexing and query parsing.
    query = parser.parse(command)
    print query.toString()

    #test the analyzerWrapper
    #printTokens(aWrapper,command,'title')
    #printWrappedAnalyzer(aWrapper)

    # optional: sort all matching docs by a custom field
    #sortField = SortField('boost',SortField.Type.FLOAT,True)  # True means descending order
    #sort = Sort(sortField)
    '''
    Error with:
    > query = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
    > ["payload","subject"], analyzer).parse(command)
    I think there's a bug with the method binding. MultiFieldQueryParser has
    several static parse methods, plus the inherited regular method from
    QueryParser. It looks like all of them are being resolved as if they were
    static. As a workaround, you can call it like this:
    parser = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
                                          ["payload","subject"], analyzer)
    lucene.MultiFieldQueryParser.parse(parser, command)
    '''
    #occ=[BooleanClause.Occur.SHOULD , BooleanClause.Occur.SHOULD]
    #query = MultiFieldQueryParser.parse(command_list,['subject_id','summary'],occ,analyzer)
    #query = QueryParser(Version.LUCENE_CURRENT, FIELD,analyzer).parse(command)
    #scoreDocs = searcher.search(query, 50,sort).scoreDocs
    scoreDocs = searcher.search(query, 50).scoreDocs

    # retList = []
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     score = scoreDoc.score
    #     #print 'subject_id:', doc.get('subject_id')
    #     #print 'title:', doc.get('title')
    #     tmpDict = {
    #         'subject_id':doc.get('subject_id'),
    #         'title':doc.get('title'),
    #         'directors':doc.get('directors'),
    #         'summary':doc.get('summary'),
    #         'image_small':doc.get('image_small'),
    #         'boost':doc.get('boost'),
    #         'user_tags':doc.get('user_tags'),
    #         'year':doc.get('year'),
    #         'score':score
    #     }
    #     retList.append(tmpDict)
    maxDict = utils.maxDict
    movieDictList = utils.scoreDocs2dictList(scoreDocs, searcher)
    retList = movieDictList
    retList = utils.reRank(movieDictList, maxDict, command)  # manual re-ranking
    #retList = sorted(retList, key=operator.itemgetter('boost'), reverse=True)
    del searcher
    return retList
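# Sketch of the MultiFieldQueryParser workaround described in the comment block
# above (PyLucene resolves the overloaded parse() methods as if they were all
# static). This assumes a running JVM and that MultiFieldQueryParser is imported;
# it is only referenced in the commented-out lines above, so treat the field
# names here as hypothetical.
def parse_multi_field(command, analyzer, fields=['subject_id', 'summary']):
    # build the parser normally, but invoke parse() through the class,
    # passing the parser instance explicitly as the first argument
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    return MultiFieldQueryParser.parse(parser, command)

# usage (hypothetical):
#   query = parse_multi_field(command, aWrapper)
#   scoreDocs = searcher.search(query, 50).scoreDocs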
def searchStemFirst(self, annotation):
    annotations = list()
    pocString = QueryParser.escape(annotation.getText())
    preparePocStringOriginal = "\"" + pocString + "\""
    preparePocStringLowercase = "\"" + pocString.lower() + "\""
    try:
        maxSynonyms = 0
        # Analyzer stemmedAnalyser =
        # AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
        # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
        # synonymMap, maxSynonyms);
        stemmedAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
        analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

        # search the stemmed field first
        stemParser = QueryParser(Version.LUCENE_CURRENT,
                                 FreyaConstants.FIELD_STEMMED_CONTENT, stemmedAnalyser)
        query = stemParser.parse(preparePocStringLowercase)
        result = self._searcher.search(query, 1)
        logging.info("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        allHits = []  # start empty so the concatenations below never see an unbound name
        if freq > 0:
            result = self._searcher.search(query, freq)
            allHits = pyJava.JArray2List(result.scoreDocs)
        # if(stemHits.length == 0) {
        # search lowercased exact
        parser = QueryParser(Version.LUCENE_CURRENT,
                             FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
        query = parser.parse(preparePocStringLowercase)
        result = self._searcher.search(query, 1)
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            lowHits = result.scoreDocs
            allHits = allHits + pyJava.JArray2List(lowHits)  # ArrayUtils.addAll(allHits, lowHits)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
        # }
        # if(allHits.length == 0) {
        # search exact
        exactParser = QueryParser(Version.LUCENE_CURRENT,
                                  FreyaConstants.FIELD_EXACT_CONTENT, analyser)
        query = exactParser.parse(preparePocStringLowercase)
        result = self._searcher.search(query, 1)
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            allHits = allHits + pyJava.JArray2List(result.scoreDocs)  # ArrayUtils.addAll(allHits, result.scoreDocs)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
        # }
        # for (ScoreDoc hit : hits) {
        indexus = 0
        while indexus < len(allHits):
            hit = allHits[indexus]
            doc = self._searcher.doc(hit.doc)
            self._searcher.explain(query, hit.doc)
            ann = Annotation()
            features = dict()
            features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
            features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
            features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
            features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
            features["score"] = hit.score
            ann.setFeatures(features)
            ann.setEndOffset(annotation.getEndOffset())
            ann.setStartOffset(annotation.getStartOffset())
            ann.setSyntaxTree(annotation.getSyntaxTree())
            ann.setText(annotation.getText())
            annotations.append(ann)
            indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return annotations
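# Sketch (hypothetical helper, not in the original class): the lookup methods
# above all repeat the same two-pass idiom -- search with a limit of 1 just to
# read totalHits, then search again asking for exactly that many hits.
def fetch_all_hits(searcher, query):
    """Return every ScoreDoc matching query, using a probe search for the count."""
    probe = searcher.search(query, 1)
    total = probe.totalHits
    if total == 0:
        return []
    return pyJava.JArray2List(searcher.search(query, total).scoreDocs)

# e.g. in searchStemFirst: allHits = fetch_all_hits(self._searcher, query)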
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue()) for field in doc.getFields())
    print template.substitute(table)
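# Quick illustration of the '#'-delimited template used above (pure Python, no
# Lucene required). The field names are hypothetical examples; in the script the
# placeholders are filled from each matching document's stored fields via
# template.substitute(table).
example = CustomTemplate('#title (#year)')
print example.substitute({'title': 'Lucene in Action', 'year': '2010'})
# -> Lucene in Action (2010)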
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

# This script will prepare a CSV for Cassandra DB

if __name__ == '__main__':
    lucene.initVM()
    base_dir = os.path.abspath(os.path.curdir)
    index_file = os.path.join(base_dir, INDEX_DIR)
    store = SimpleFSDirectory(File(index_file))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(store)
    searcher = IndexSearcher(reader)
    query_parser = QueryParser(Version.LUCENE_CURRENT, "netflix_id", analyzer)

    with open(sys.argv[1], 'r') as ratings:
        for line in ratings:
            user_id, netflix_id, score = line.split(",")
            query = query_parser.parse(netflix_id)
            scoreDocs = searcher.search(query, 1).scoreDocs
            if scoreDocs:
                doc = searcher.doc(scoreDocs[0].doc)
                film_id = doc.getField("id").stringValue()
                # trailing comma: 'score' still carries the newline read from the file
                print "{0},{1},{2}".format(user_id, film_id, score),
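# Illustration (hypothetical values, not real data): each input line of the form
# "user_id,netflix_id,score" is rewritten with the internal id looked up from
# the index, e.g.
#   input : 42,11283,4
#   output: 42,<value of the matching doc's "id" field>,4
# Lines whose netflix_id has no match in the index are silently skipped.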
def search(request): query = request.GET.get('q', None) page = int(request.GET.get('page', 1)) perPage = 5 nodes = [] usage = {} usage["time"] = time.time() if not query: count = 0 nodes = [] keywords = [] else: #conn = ReplicaSetConnection('localhost', replicaSet='jlu') conn = MongoClient('localhost') db = conn.sina #db.read_preference = ReadPreference.SECONDARY CACHE = db.cache keywords = query.split(' ') cache = CACHE.find_one({"query":keywords,"page":page}) if cache == None: print "query cache not found" VM_ENV.attachCurrentThread() fsDir = SimpleFSDirectory(File(settings.ROOT_DIR+'/index')) searcher = IndexSearcher(DirectoryReader.open(fsDir)) analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, 'text', analyzer) parser.setDefaultOperator(QueryParser.Operator.AND) lucene_query = parser.parse(query) scoreDocs = searcher.search(lucene_query, 3000000).scoreDocs ids = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) for field in doc.getFields(): ids.append(field.stringValue()) print "got ids from lucene",len(ids) ids = [int(x) for x in ids] NODES = conn.sina.nodes count = 0 for n in NODES.find({"node_id":{"$in":ids}}).sort("in_degree",-1).skip((page-1)*perPage): count += 1 print "doing",n["node_id"],count,"/",perPage n["js"] = similarity(n["node_id"],topk=10) nodes.append(n) if len(nodes) == perPage: break count = len(ids) CACHE.insert({"query":keywords,"page":page,"cache":nodes,"count":len(ids)}) usage["isCache"] = False else: print "found query cache" usage["isCache"] = True nodes = cache["cache"] count = cache["count"] pagenav = {} if page == 1: pagenav["has_pre"] = None else: pagenav["has_pre"] = page - 1 if page > count/perPage: pagenav["has_next"] = None else: pagenav["has_next"] = page + 1 pagenav["page"] = page usage["time"] = time.time() - usage["time"] return { 'q' : request.GET.get('q', ''), 'keywords' : keywords, 'nodes' : nodes, 'count' : count, 'page' : pagenav, 'usage' : usage, }
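# Sketch (hypothetical helper, not in the original view): the cache handling in
# search() above is a plain cache-aside pattern over the MongoDB "cache"
# collection -- look the (keywords, page) pair up first, compute and store only
# on a miss. insert() mirrors the call used above (newer pymongo prefers
# insert_one()).
def cached_search(cache_collection, keywords, page, compute):
    hit = cache_collection.find_one({"query": keywords, "page": page})
    if hit is not None:
        return hit["cache"], hit["count"], True          # nodes, count, served from cache
    nodes, count = compute()                             # run the Lucene + Mongo query
    cache_collection.insert({"query": keywords, "page": page,
                             "cache": nodes, "count": count})
    return nodes, count, False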
def run(command, searcher, aWrapper, use_custom_parser=False, debug=False):
    if not os.path.isdir(query_log_dir):
        os.mkdir(query_log_dir)
    search_start = time.time()
    query_log_file = os.path.join(query_log_dir,
                                  'query_log.%s' % datetime.now().strftime('%Y-%m-%d'))
    fw = open(query_log_file, 'a+')
    cur_time = datetime.now()
    cur_time = cur_time + timedelta(hours=8)  # compensate for the server timezone not being China (UTC+8)
    fw.write('\n*********query-log,time=%s*************\n'
             % cur_time.strftime('%Y-%m-%d %H:%M:%S'))
    fw.write('raw_str=%s\n' % unicode_to_str(command))
    if command == '':
        return
    #debug
    #print "Searching for:"+command
    #query = MultiFieldQueryParser(Version.LUCENE_CURRENT,['subject_id','summary'],analyzer).parse(command);
    #query = MultiFieldQueryParser.parse(command,['subject_id','summary'],analyzer);
    #'''
    #MultiFieldQueryParser(Version matchVersion, String[] fields, Analyzer analyzer)
    #'''
    #parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, JArray('string')(['subject_id','summary']),analyzer)
    #query = MultiFieldQueryParser.parse(parser, command_jarr)

    if debug:
        print 'before query parser: ', command
    command = custom_parser.parse(command) if use_custom_parser else command
    fw.write('parsed_str=%s\n' % unicode_to_str(command))
    if debug:
        print 'after query parser: ', command

    # create a QueryParser object; the default search field is 'title'
    parser = QueryParser(Version.LUCENE_CURRENT, "title", aWrapper)
    # A PerFieldAnalyzerWrapper can be used like any other analyzer,
    # for both indexing and query parsing.
    query = parser.parse(command)
    if debug:
        print 'after lucene QueryParser: ', query.toString().encode('utf8')
    fw.write('lucene_str=%s\n' % unicode_to_str(query.toString()))

    #test the analyzerWrapper
    #printTokens(aWrapper,command,'title')
    #printWrappedAnalyzer(aWrapper)

    # optional: sort all matching docs by a custom field
    #sortField = SortField('boost',SortField.Type.FLOAT,True)  # True means descending order
    #sort = Sort(sortField)
    '''
    Error with:
    > query = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
    > ["payload","subject"], analyzer).parse(command)
    I think there's a bug with the method binding. MultiFieldQueryParser has
    several static parse methods, plus the inherited regular method from
    QueryParser. It looks like all of them are being resolved as if they were
    static.
    As a workaround, you can call it like this:
    parser = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
                                          ["payload","subject"], analyzer)
    lucene.MultiFieldQueryParser.parse(parser, command)
    '''
    #occ=[BooleanClause.Occur.SHOULD , BooleanClause.Occur.SHOULD]
    #query = MultiFieldQueryParser.parse(command_list,['subject_id','summary'],occ,analyzer)
    #query = QueryParser(Version.LUCENE_CURRENT, FIELD,analyzer).parse(command)
    #scoreDocs = searcher.search(query, 50,sort).scoreDocs
    retN = 50
    start_time = time.time()
    scoreDocs = searcher.search(query, retN).scoreDocs
    cost_time = time.time() - start_time

    # retList = []
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     score = scoreDoc.score
    #     #print 'subject_id:', doc.get('subject_id')
    #     #print 'title:', doc.get('title')
    #     tmpDict = {
    #         'subject_id':doc.get('subject_id'),
    #         'title':doc.get('title'),
    #         'directors':doc.get('directors'),
    #         'summary':doc.get('summary'),
    #         'image_small':doc.get('image_small'),
    #         'boost':doc.get('boost'),
    #         'user_tags':doc.get('user_tags'),
    #         'year':doc.get('year'),
    #         'score':score
    #     }
    #     retList.append(tmpDict)
    maxDict = utils.maxDict
    movieDictList = utils.scoreDocs2dictList(scoreDocs, searcher)
    retList = movieDictList
    retList = utils.reRank(movieDictList, maxDict, command)  # manual re-ranking
    #retList = sorted(retList, key=operator.itemgetter('boost'), reverse=True)

    fw.write('***********return list(search/total=%.2fs/%.2fs)***************\n'
             % (cost_time, time.time() - search_start))
    for r in retList:
        line = '%s: %s, boost->%s||score=%s\n' % (r['subject_id'], r['title'], r['boost'], r['score'])
        fw.write(unicode_to_str(line))
    fw.write('**************************************************************\n\n')
    del searcher
    return retList[:20] if debug else retList
def searchIndex(self, annotation, specialTreatment):
    if specialTreatment:
        return self.searchStemFirst(annotation)
    annotations = list()  # ArrayList[Annotation]()
    try:
        maxSynonyms = 0
        stemAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
        # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
        # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
        # synonymMap, maxSynonyms);
        analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
        pocString = QueryParser.escape(annotation.getText())
        preparePocString = "\"" + pocString + "\""
        preparePocStringLowercase = "\"" + pocString.lower() + "\""

        # search exact content first
        query = parser.parse(preparePocString)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        hits = []  # start empty so the fallbacks below never see an unbound name
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
        if freq <= 0:
            # search lowercased exact
            lowerCasedParser = QueryParser(Version.LUCENE_CURRENT,
                                           FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
            query = lowerCasedParser.parse(preparePocStringLowercase)
            # logging.info("Searching for: " + query.toString());
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                hits = pyJava.JArray2List(result.scoreDocs)
                logging.debug("For " + str(query) + " : " + str(result.totalHits))
        # single-token term only: use find() so a missing space yields -1 instead of raising
        if len(hits) == 0 and preparePocStringLowercase.find(" ") < 0:
            # search stemmed
            stemParser = QueryParser(Version.LUCENE_CURRENT,
                                     FreyaConstants.FIELD_STEMMED_CONTENT, stemAnalyser)
            query = stemParser.parse(preparePocStringLowercase)
            # logging.info("Searching for: " + query.toString());
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                hits = pyJava.JArray2List(result.scoreDocs)
                logging.info("For " + str(query) + " : " + str(result.totalHits))
        # for (ScoreDoc hit : hits) {
        indexus = 0
        while indexus < len(hits):
            hit = hits[indexus]
            doc = self._searcher.doc(hit.doc)
            self._searcher.explain(query, hit.doc)
            ann = Annotation()
            features = dict()
            features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
            features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
            features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
            features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
            features[FreyaConstants.SCORE] = hit.score
            ann.setFeatures(features)
            ann.setEndOffset(annotation.getEndOffset())
            ann.setStartOffset(annotation.getStartOffset())
            ann.setSyntaxTree(annotation.getSyntaxTree())
            ann.setText(annotation.getText())
            annotations.append(ann)
            indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return annotations
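# Sketch (hypothetical helper): searchIndex and searchStemFirst above build their
# queries the same way -- escape the annotation text with QueryParser.escape()
# and wrap it in quotes so it is parsed as a single phrase against one field.
def phrase_query(field, text, analyzer, lowercase=False):
    escaped = QueryParser.escape(text)
    if lowercase:
        escaped = escaped.lower()
    parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
    return parser.parse("\"" + escaped + "\"")

# e.g. query = phrase_query(FreyaConstants.FIELD_EXACT_CONTENT,
#                           annotation.getText(), analyser)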