def searchKey(self, key, rank=None):
    query = ""
    try:
        MAX = 100000
        qp = QueryParser(Version.LUCENE_35, "key", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(key)
        hits = searcher.search(query, MAX)
        sentence_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            try:
                sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
            except:
                print doc.get("sentence")
        return sentence_list
    except:
        print("Fail in receiving sentence with term " + key)
        print("query", query)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def search(r, keyword=""): import logging logger = logging.getLogger("search") bench = Benchmark(logger) from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit import lucene, os os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17" lucene.initVM(lucene.CLASSPATH) directory = FSDirectory.open(File(CONFIG.INDEX_PATH)) ROBOT_INDEX = IndexSearcher(directory, True) ROBOT_ANALYZER = StandardAnalyzer() keyword = keyword or r.GET["keyword"] query = QueryParser("context", ROBOT_ANALYZER) query = query.parse('"%s"' % keyword) bench.start_mark("search") hits = ROBOT_INDEX.search(query) count = len(hits) result = [] i = 0 for hit in hits: i += 1 if i > 100: break doc = Hit.cast_(hit).getDocument() result.append(SearchResult(doc, i, keyword)) ROBOT_INDEX.close() et = bench.stop_mark() return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
def query(indexName, queryString):
    indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
    qp = QueryParser(Version.LUCENE_CURRENT, "content",
                     StandardAnalyzer(Version.LUCENE_CURRENT))
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parse(queryString.replace("-", "_"))
    aux = indSearcher.search(query, 100)
    results = aux.scoreDocs
    hits = aux.totalHits
    ir = indSearcher.getIndexReader()
    res = []
    for r in results:
        # fetch each hit by its document id, not by the loop index
        doc = ir.document(r.doc)
        res.append(doc.get('id'))
    return res
def testPrefixQuery(self):
    parser = QueryParser(Version.LUCENE_CURRENT, "category",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    parser.setLowercaseExpandedTerms(False)
    print parser.parse("/Computers/technology*").toString("category")
def search(self, string, special=None):
    query = ""
    try:
        MAX = 100000
        # for dates such as 1931.08.06
        string = string.replace(".", " ")
        array = re.findall(r'[\w\s]+', string)
        string = ""
        for item in array:
            string += item
        qp = QueryParser(Version.LUCENE_35, "title", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(string)
        hits = searcher.search(query, MAX)
        sentence_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            sentence_list.append(doc.get("title").encode("utf-8"))
        return sentence_list
    except:
        print("Fail in receiving sentence with term " + string)
        print("query", query)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def searchXYPair(self, x, y):
    """ Returns all sentences that are tagged with the given two entities (x, y) """
    tmp_hm = {}
    if x == "" or y == "":
        return []
    try:
        array = re.findall(r'[\w\s]+', x)
        x = ""
        for item in array:
            x += item
        qp = QueryParser(Version.LUCENE_35, "X", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(x)
        MAX = 100000
        result_list = []
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            y_entry = doc["Y"]
            if y_entry == y:
                # use the dict as a set to deduplicate sentences
                tmp_hm[doc["Sentence"]] = ""
        for key in tmp_hm:
            result_list.append(IndexUtils.sentence_wrapper(key))
        tmp_hm = {}
        return result_list
    except:
        print("Fail (search XYPair) in x:" + x + " y:" + y)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def searchForDbpediaURI(self, uri):
    """
    Returns all anchor texts that are related to the given DBpedia URI.
    For each anchor text it also returns the corresponding URI and the
    number of times the anchor appears on the English Wikipedia.
    """
    uri_old = uri
    uri = uri.replace("http://dbpedia.org/resource/", "")
    array = re.findall(r'[\w\s]+', uri)
    uri = ""
    for item in array:
        uri += item
    try:
        qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        MAX = 10000
        result = []
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
            if dbpedia_uri == uri_old:
                result.append([doc["anchor"].encode("utf-8"),
                               doc["anchor_uri"].encode("utf-8"),
                               dbpedia_uri,
                               doc["number"].encode("utf-8")])
        return result
    except:
        print("searchForDbpediaURI - Fail in uri: " + uri)
        return []
def searchForDbpediaURI(self, uri):
    """ Returns all sentences that are tagged with the given DBpedia URI """
    print "in searchForDbpediaURI"
    uri_old = uri
    uri = uri.replace("http://dbpedia.org/ontology/", "")
    uri = uri.replace("http://dbpedia.org/property/", "")
    uri = uri.replace("http://dbpedia.org/resource/", "")
    array = re.findall(r'[\w\s]+', uri)
    uri = ""
    for item in array:
        uri += item
    result = []  # initialised before the try so the except handler can return it
    try:
        qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        print "query: " + str(query)
        MAX = 500000
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["URI"]
            if dbpedia_uri == uri_old:
                result.append([IndexUtils.sentence_wrapper(doc["Sentence"]),
                               doc["X"], doc["Y"], dbpedia_uri])
        return result
    except:
        print("Fail in uri: " + uri)
        print "Unexpected error:", sys.exc_info()[0]
        return result
def does_line_existNew(self, line, x, y):
    """ Checks whether the parsed sentence already exists in the index """
    query = ""
    try:
        array = re.findall(r'[\w]+', line)
        string = ""
        for item in array:
            string += item + " "
        qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(string)
        MAX = 10
        hits = searcher.search(query, MAX)
        return len(hits.scoreDocs) > 0
    except Exception:
        s_tmp = str(sys.exc_info())
        if "too many boolean clauses" in s_tmp:
            # Only occurs with very long sentences. Return True so the
            # sentence is not re-added each time, avoiding repeated errors.
            print "too many boolean clauses"
            return True
        else:
            print "Unexpected error:", sys.exc_info()[0]
            print "in does line exist"
            print s_tmp
            return False
def searchString(self, string):
    'searches for a string and returns an array of POS-tagged sentences'
    query = ""
    try:
        MAX = 100000
        # for dates such as 1931.08.06
        string = string.replace(".", " ")
        array = re.findall(r'[\w\s]+', string)
        string = ""
        for item in array:
            string += item
        qp = QueryParser(Version.LUCENE_35, "sentence", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(string)
        hits = searcher.search(query, MAX)
        sentence_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
        return sentence_list
    except:
        print("Fail in receiving sentence with term " + string + " in search term")
        print("query", query)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def main(cls, argv):
    allBooks = MatchAllDocsQuery()
    parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    query = BooleanQuery()
    query.add(allBooks, BooleanClause.Occur.SHOULD)
    query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)
    indexDir = System.getProperty("index.dir")
    directory = SimpleFSDirectory(File(indexDir))
    example = SortingExample(directory)
    example.displayResults(query, Sort.RELEVANCE)
    example.displayResults(query, Sort.INDEXORDER)
    example.displayResults(query, Sort(SortField("category", SortField.STRING)))
    example.displayResults(query, Sort(SortField("pubmonth", SortField.INT, True)))
    example.displayResults(query,
                           Sort([SortField("category", SortField.STRING),
                                 SortField.FIELD_SCORE,
                                 SortField("pubmonth", SortField.INT, True)]))
    example.displayResults(query,
                           Sort([SortField.FIELD_SCORE,
                                 SortField("category", SortField.STRING)]))
    directory.close()
def testPhraseQuery(self):
    analyzer = StandardAnalyzer(Version.LUCENE_24)
    q = QueryParser(Version.LUCENE_24, "field",
                    analyzer).parse('"This is Some Phrase*"')
    self.assertEqual('"some phrase"', q.toString("field"), "analyzed")
    q = QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse('"term"')
    self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
def main(cls):
    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        cls.synonymAnalyzer).parse('"fox jumps"')
    print "\"fox jumps\" parses to", query.toString("content")
    print "From AnalyzerUtils.tokensFromAnalysis:"
    AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
    print ''
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term

    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # computed before the try so the except handler can safely return it
    querywordlist = query.split()
    try:
        file = open('../features.txt', 'r')
        featurelist = []
        for line in file.readlines():
            featurelist += line.split()
        featureQueryList = []
        productQueryList = []
        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # parse the word and keep it if it matches any product title
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)
        featureQuery = ""
        if featureQueryList:
            featureQuery = "(" + " AND ".join(featureQueryList) + ")"
        print featureQuery
        productQuery = ""
        if productQueryList:
            productQuery = "(" + " AND ".join(productQueryList) + ")"
        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason:", ex
        return ("", "(" + query + ")", [], querywordlist)
def __init__(self, emoticon, searcher, analyzer, english_only=False):
    super(PMICalculator, self).__init__()
    self.field = "emoticons"
    self.emoticon = emoticon
    self.searcher = searcher
    self.analyzer = analyzer
    self.escaped_emoticon = QueryParser.escape(self.emoticon)
    self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
    self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    if english_only:
        country = "United States"
        country_prefix = "US"
    else:
        country = None
        country_prefix = ""
    self.pmi_file_name = (self.raw_stats_dir
                          + normalizeEmoticonName(self.emoticon).rstrip("_")
                          + ("_%s" % country_prefix) * english_only
                          + ".pmidata")
    self.sample_tweets_name = (self.raw_stats_dir
                               + normalizeEmoticonName(self.emoticon).rstrip("_")
                               + ("_%s" % country_prefix) * english_only
                               + ".samptweets")
    self.sample_tweets_file = codecs.open(self.sample_tweets_name,
                                          encoding="utf-8", mode="w")
    self.term_count_collector = TermCountCollector(searcher, emoticon, country)
    print "starting query at:", time.time()
    hits = self.searcher.search(self.query, self.term_count_collector)

    # some emoticons have a common variant; count it as the same emoticon
    if emoticon == ":P":
        ee_two = QueryParser.escape(":p")
    elif emoticon == "T_T":
        ee_two = QueryParser.escape("TT")
    elif emoticon == "^_^":
        ee_two = QueryParser.escape("^^")
    if emoticon in [":P", "T_T", "^_^"]:
        q_two = QueryParser("emoticons", self.analyzer).parse(ee_two)
        hits_two = self.searcher.search(q_two, self.term_count_collector)

    self.terms = self.term_count_collector.getTerms()
    self.query_result_count = self.term_count_collector.getDocCount()
    for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items():
        for p_term_tweet in p_term_tweets:
            self.sample_tweets_file.write("term: " + p_term +
                                          " tweet: " + p_term_tweet + "\n")
    self.sample_tweets_file.close()
    self.base_stats_file = open(self.raw_stats_dir + "emoticon_pmi_stats.txt", "r")
    self.n = int(self.base_stats_file.read().strip().split(":")[1])
    print "computing PMI for query:", self.emoticon, "at:", time.time()
    self.p_query_result = self.query_result_count * 1.0 / self.n
def testBasicQueryParser(self):
    analyzer = SimpleAnalyzer()
    query = QueryParser(Version.LUCENE_CURRENT, "description",
                        analyzer).parse("partnum:Q36 AND SPACE")
    scoreDocs = self.searcher.search(query, 50).scoreDocs
    self.assertEqual("+partnum:q +space", query.toString("description"),
                     "note Q36 -> q")
    self.assertEqual(0, len(scoreDocs), "doc not found :(")
def testPhraseQuery(self):
    analyzer = StandardAnalyzer(Version.LUCENE_24)
    q = QueryParser(Version.LUCENE_24, "field",
                    analyzer).parse('"This is Some Phrase*"')
    self.assertEqual('"some phrase"', q.toString("field"), "analyzed")
    q = QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse('"term"')
    self.assert_("TermQuery" == q.getClassName(), "reduced to TermQuery")
def getResultScoreDocs(query):
    # create an analyzer and a parser for the user-submitted query
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs
    return scoreDocs
def testWithSlop(self):
    searcher = IndexSearcher(self.directory, True)
    parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.porterAnalyzer)
    parser.setPhraseSlop(1)
    query = parser.parse('"over the lazy"')
    topDocs = searcher.search(query, 50)
    self.assertEqual(1, topDocs.totalHits, "hole accounted for")
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary of key=value pairs where keys are fields in our
    Lucene document and values are the user's search terms, and builds a
    BooleanQuery joining those field:value clauses with the given operator.
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)
    query_list = ["%s:\"%s\"" % (field, process_query_param(val))
                  for (field, val) in get_adv_query_packet(params)]
    return parser.parse((" " + operator + " ").join(query_list))
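# Usage sketch for build_advanced_search_query, assuming
# get_adv_query_packet(params) simply yields the dict's (field, value)
# pairs; the field names below are illustrative only, not taken from
# the original code:
#
#   analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
#   params = {"name": "lego mindstorms", "category": "robotics"}
#   query = build_advanced_search_query(params, "AND", analyzer)
#   # parses to something like: +name:"lego mindstorms" +category:"robotics"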
def pesquisar_com_lucene():
    initVM()
    # Get a handle to the index directory
    directory = SimpleFSDirectory(File(STORE_DIR))
    # Creates a searcher searching the provided index. Use a single
    # IndexSearcher instance across queries to improve performance.
    ireader = IndexReader.open(directory, True)
    searcher = IndexSearcher(ireader)
    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    for query in querys:
        query_number = query.query_number
        # Construct a query parser; we specify the field to search in.
        query.query_text = query.query_text.replace('?', '')
        query.query_text = query.query_text.replace('*', '')
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        # Create the query, run it, and collect the top hits
        query = queryParser.parse(query.query_text)
        topDocs = searcher.search(query, 50000)
        scoreDocs = topDocs.scoreDocs
        r = resultado_query(query_number, scoreDocs)
        resultados.append(r)
    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i, int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number, resultados_da_row])
def testDateRangeQuery(self):
    # locale difference between the JRE and GCJ: 1/1/04 -> 01/01/04
    expression = "modified:[01/01/04 TO 12/31/04]"
    parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
    parser.setLocale(Locale.US)
    query = parser.parse(expression)
    print expression, "parsed to", query
    topDocs = self.searcher.search(query, 50)
    self.assert_(topDocs.totalHits > 0)
def testTermRangeQuery(self):
    query = QueryParser(Version.LUCENE_CURRENT, "subject",
                        self.analyzer).parse("title2:[K TO N]")
    self.assert_(query.getClassName() == "TermRangeQuery")
    scoreDocs = self.searcher.search(query, 10).scoreDocs
    self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms")
    query = QueryParser(Version.LUCENE_CURRENT, "subject",
                        self.analyzer).parse("title2:{K TO Mindstorms}")
    scoreDocs = self.searcher.search(query, 10).scoreDocs
    self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms", True)
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        parsed_command = QueryParser.escape(command)
        query = QueryParser("text", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()
        try:
            hctr = 0
            for hit in hits:
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms():
                    trm_str += " " + trm
                print "term string:", trm_str.encode("ascii", "ignore")
                hctr += 1
                if hctr > hits.length() - 2 or hctr > 100:
                    break
                print 'uid:', hit.get("user_id"), 'timestamp:', hit.get("timestamp"), \
                      "country:", hit.get('country'), "emoticons:", hit.get('emoticons')
        except Exception, e:
            print "failed to list hit:", e
        print
        command = raw_input("Query:")
        parsed_command = QueryParser.escape(command)
        print "Searching for emoticon:", parsed_command
        query = QueryParser("emoticons", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()
        try:
            hctr = 0
            for hit in hits:
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms():
                    trm_str += " " + trm
                print "term string:", trm_str.encode("ascii", "ignore")
                hctr += 1
                if hctr > hits.length() - 2 or hctr > 100:
                    break
                print 'uid:', hit.get("user_id"), 'timestamp:', hit.get("timestamp"), \
                      "country:", hit.get('country'), "emoticons:", hit.get('emoticons')
        except Exception, e:
            print "failed to list hit:", e
def query(indexName, queryFile, runName):
    indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
    indSearcher = IndexSearcher(indReader)
    ir = indSearcher.getIndexReader()
    qp = QueryParser(Version.LUCENE_CURRENT, "content",
                     StandardAnalyzer(Version.LUCENE_CURRENT))
    f = open('results-' + runName, 'w')
    while True:
        id = queryFile.readline()
        if id == "":
            break
        id = id.replace("C", "").replace("\n", "")
        queryString = queryFile.readline()
        queryString = queryString.replace("?", "").replace("*", "")
        queryString = queryString.replace("-", "_").replace("\n", "")
        query = qp.parse(queryString)
        queryFile.readline()  # skip the separator line
        returnedDocs = 1000
        collector = TopScoreDocCollector.create(returnedDocs, True)
        indSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
        print "Total hits for query " + id + ": " + str(len(hits))
        for i, hit in enumerate(hits):
            doc = ir.document(hit.doc)
            f.write(id + " 0 " + doc.get('id') + " " + str(i + 1) + " " +
                    str(hit.score) + " " + runName + "\n")
    f.close()
def calculateEmoticonDiffusion(emoticon, searcher, analyzer, user_location_hash,
                               usage_threshold=1, comm_threshold=1):
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    emoticon_stats_file = open(raw_stats_dir + "emoticon_diffusion_stats.txt", "r")
    total_users = int(emoticon_stats_file.read().strip())
    emoticon_stats_file.close()
    emoticon_file_name = raw_stats_dir + \
        normalizeEmoticonName(emoticon).rstrip('_') + ".diffusion_bidir"
    print "Calculating Diffusion for:", emoticon, "at:", time.time()
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    if hits.length() == 0:
        return
    print "compiling diffusion stats at:", time.time()
    emoticon_users_by_time_hash = {}
    emoticon_users_adopters_hash = {}
    emoticon_users_non_adopters_hash = {}
    users_exposure_hash = {}
    reverse_users_exposure_hash = {}
    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr % 100000 == 0:
                print "on hit:", hctr
            if hctr == hits.length():
                break
            uid = hit.get("user_id")
            timestamp = int(hit.get("timestamp"))
            country = hit.get('country')
            emoticons = hit.get('emoticons')
            user_id_replied = hit.get('user_id_replied')
            emoticon_users_by_time_hash[uid] = \
                emoticon_users_by_time_hash.get(uid, []) + [timestamp]
    except Exception, e:
        pass
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    Searches a boolean query in the learned Lucene index.

    Arguments:
        index_dir - the Lucene index directory
        query_text - the query text, following
            http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to retrieve

    Return:
        rows - the returned document details
    '''
    DEFAULT_QUERY_FIELD = 'all'
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field, 'empty') != 'empty':
                row.append(table.get(field, 'empty'))
            else:
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID, 'empty')))  # unique file id
        row.append(scoreDoc.score)
        rows.append(row)
    return rows
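# Usage sketch, assuming an index already exists at the given path and
# that MetadataType / STD_ANALYZER are configured as elsewhere in this
# module (the path and query below are illustrative only):
#
#   rows = boolean_search_lucene_index('/tmp/lucene-index',
#                                      'title:lucene AND author:doug', 20)
#   for row in rows:
#       print row[-1], row[:-1]   # score, then the metadata fields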
def testAnalyzer(self):
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryString = "category:/philosophy/eastern"
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(queryString)
    self.assertEqual("category:\"philosophy eastern\"",
                     query.toString("contents"), "path got split, yikes!")
    perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
    perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        perFieldAnalyzer).parse(queryString)
    self.assertEqual("category:/philosophy/eastern",
                     query.toString("contents"), "leave category field alone")
def testLowercasing(self):
    q = QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse("PrefixQuery*")
    self.assertEqual("prefixquery*", q.toString("field"), "lowercased")
    qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    qp.setLowercaseExpandedTerms(False)
    q = qp.parse("PrefixQuery*")
    self.assertEqual("PrefixQuery*", q.toString("field"), "not lowercased")
def testSlop(self):
    q = QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse('"exact phrase"')
    self.assertEqual('"exact phrase"', q.toString("field"), "zero slop")
    qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    qp.setPhraseSlop(5)
    q = qp.parse('"sloppy phrase"')
    self.assertEqual('"sloppy phrase"~5', q.toString("field"), "sloppy, implicitly")
def run(searcher, analyzer, command):
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    rankedfiles = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rankedfiles.append(int(doc.get("name")))
    return rankedfiles
class IndexSearcherWrapper(object):

    def __init__(self, location):
        lucene.initVM()
        directory = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(directory, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                        WhitespaceAnalyzer())

    def search(self, topic, max=5000):
        query = self.query_parser.parse(topic.title)
        return self.searcher.search(query, max)
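# Usage sketch for IndexSearcherWrapper, assuming `topic` is any object
# with a `title` attribute (the index path below is illustrative only):
#
#   wrapper = IndexSearcherWrapper("/tmp/lucene-index")
#   topDocs = wrapper.search(topic, max=100)
#   for sd in topDocs.scoreDocs:
#       print wrapper.searcher.doc(sd.doc).get("text")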
def LatestSearch(self):
    try:
        searcher = IndexSearcher(self.indexDir)
        today = time.strftime('%Y%m%d')
        keyWord = today.encode('utf8')
        print keyWord
        query = QueryParser(Version.LUCENE_30, "regDate",
                            self.analyzer).parse(keyWord)
        hits = searcher.search(query, 1000)
        return self.__MakeResultFormat(hits, searcher)
    except:
        print 'BookSearcher TotalSearch Exception'
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the field's text
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
def run(searcher, analyzer, input, filepath):
    command = convert(input.decode('gbk').encode('utf8'))
    print "Search for:" + command.decode('utf8').encode('gbk')
    qp = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)
    query = qp.parse(command)
    scoreDocs = searcher.search(query, 1000000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    print
    try:
        filew = open(filepath, 'w')
        result_num = 0
        for scoreDoc in scoreDocs:
            try:
                result_num += 1
                if result_num % 1000 == 0:
                    print "Search added " + str(result_num) + " sentences..."
                doc = searcher.doc(scoreDoc.doc)
                path = doc.get("path")
                sentence = doc.get("sentence")
                OutputSentence(filew, doc.get("name"), sentence)
            except:
                continue
        filew.close()
    except:
        print "Failed in Outputsentence:"
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
def search(cls, indexDir, q):
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(fsDir, True)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
    start = time()
    hits = searcher.search(query, 50).scoreDocs
    duration = timedelta(seconds=time() - start)
    print "Found %d document(s) (in %s) that matched query '%s':" % (
        len(hits), duration, q)
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'path:', doc.get("path")
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
def testAnalyzer(self):
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryString = "category:/philosophy/eastern"
    parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
    parser.setAutoGeneratePhraseQueries(True)
    query = parser.parse(queryString)
    self.assertEqual("category:\"philosophy eastern\"",
                     query.toString("contents"), "path got split, yikes!")
    perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
    perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        perFieldAnalyzer).parse(queryString)
    self.assertEqual("category:/philosophy/eastern",
                     query.toString("contents"), "leave category field alone")
def find(self, query, indir):
    lucene.initVM()
    INDEXDIR = indir
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "<default field>",
                           lucene_analyzer).parse("text:" + query + " OR title:" + query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "\nHits:", total_hits.totalHits, "\n"
    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        print "Hit Score:", "%.4f" % hit.score, \
              "Department:", doc.get("department").encode("utf-8"), \
              "Title:", doc.get("title").encode("utf-8")
        print doc.get("url").encode("utf-8"), '\n'
def run(searcher, analyzer, querystr):
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(querystr)
    MAX = 1000
    hits = searcher.search(query, MAX)
    results = []
    for sd in hits.scoreDocs:
        doc = searcher.doc(sd.doc)
        results.append([doc.get("name"),
                        doc.get("owner").encode('gbk'),
                        doc.get("title").encode('gbk')])
    # sort results by name
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title
def run(searcher, analyzer, command, prior):
    if command == '':
        return
    store = []
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs
    scored = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # skip duplicates and entries without a question name
        if doc.get("qst_num") in scored:
            continue
        if not doc.get("qst_name"):
            continue
        scored.append(doc.get("qst_num"))
        returnfile = [doc.get("qst_name").replace(' ', ''),
                      doc.get("qst_detail").replace(' ', ''),
                      doc.get("qst_topic_accu"),
                      int(doc.get("qst_browse")),
                      int(doc.get("qst_follow")),
                      int(doc.get("qst_ans")),
                      int(doc.get("qst_num"))]
        store.append(returnfile)
    store = storesort(store, prior)
    return store
def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, 'text',
                           lucene_analyzer).parse(query)
    MAX = 1000
    # list holding the titles of the returned documents
    title_list = []
    total_hits = lucene_searcher.search(my_query, MAX)
    for hit in total_hits.scoreDocs[:10]:
        doc = lucene_searcher.doc(hit.doc)
        title_list.append({doc.get("title").encode("utf-8").lstrip(str(TXTDIR)):
                           round(hit.score, 5)}.copy())
    return title_list
def find(self, phrase):
    phrase = phrase.lower().encode('utf8')
    query = ' '.join(['+' + word for word in phrase.split(' ')])
    query = QueryParser(self._lversion, 'contents',
                        self._analyzer).parse(query)
    hits = self._searcher.search(query, self.max_candidates)

    # TODO: store article_id in the Lucene index instead of translating
    # document titles here
    links = {}
    for hit in hits.scoreDocs:
        title = quote(self._searcher.doc(hit.doc).get("title")
                      .encode('utf-8').replace(' ', '_'))
        title = title.replace('%28', '(').replace('%29', ')')
        if title in self._translation:
            links[self._translation[title]] = hit.score
        # else: title missing from the translation table (potential bug)
    return self._links[phrase].get(-1, 0), links
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return
        command_dict = parseCommand(command)
        # segment the Chinese query into space-separated words
        sep_command = " ".join(jieba.cut(command_dict['contents']))
        command_dict['contents'] = sep_command
        if not command_dict.has_key('site'):
            command = command_dict['contents']
        else:
            command = command_dict['contents'] + " site:" + command_dict['site']
        print
        print "Searching for:", command
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: Explainer <index dir> <query>"
    else:
        indexDir = argv[1]
        queryExpression = argv[2]
        directory = SimpleFSDirectory(File(indexDir))
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse(queryExpression)
        print "Query:", queryExpression
        searcher = IndexSearcher(directory)
        scoreDocs = searcher.search(query, 50).scoreDocs
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            explanation = searcher.explain(query, scoreDoc.doc)
            print "----------"
            print doc["title"].encode('utf-8')
            print explanation
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        # Convert the query to Unicode (the indexed files are Unicode too).
        # The console encoding is GBK under Python IDLE but UTF-8 under
        # PyScripter-Portable (see Q.ppt).
        command = unicode(command, 'GBK')
        if command == '':
            return
        print
        print "Searching for:", command
        # QueryParser runs the analyzer over the query string, parses the
        # result into a query syntax tree, and returns it as a Query.
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        # IndexSearcher searches the index with the Query and returns the hits.
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
def run(searcher, analyzer, command):
    if command == '':
        return
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 300).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    text = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text.append([doc.get("url"), doc.get('title'), doc.get("imgurl"),
                     doc.get("price"), doc.get("kind")])
    return text
    writer.addDocument(doc)

print("Indexed lines from stdin (%d documents in index)" % writer.numDocs())
print("About to optimize index of %d documents..." % writer.numDocs())
writer.optimize()
print("...done optimizing index of %d documents" % writer.numDocs())
print("Closing index of %d documents..." % writer.numDocs())
writer.close()
print("...done closing index")

# RETRIEVAL
dir = SimpleFSDirectory(File(fullIndexDir))
analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
searcher = IndexSearcher(dir)
query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                    analyzer).parse(u"¿Dónde está La Mancha?")
MAX = 1000
hits = searcher.search(query, MAX)
print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
for hit in hits.scoreDocs:
    print(hit.score, hit.doc, hit.toString())
    doc = searcher.doc(hit.doc)
    print(doc.get("text").encode("utf-8"))
    print(doc.get("metadata").encode("utf-8"))
        f.write(string)
        f.write(item)

config = ConfigParser()
config.read('config.ini')
r_server = Redis('localhost')
lst = []
search = str(sys.argv[1])

if __name__ == "__main__":
    lucene.initVM()
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)
    for hit in hits.scoreDocs:
        if hit.score >= 0.0:
            doc = searcher.doc(hit.doc)
            items = doc.get("text").encode("utf-8").split(
                config.get('counting', 'delimiter'))
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
def post(self):
    q = self.get_argument("query")
    k = self.get_argument("kTerms")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    items = []
    rQ = []
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        print(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)
        rQ.append("html_files/" + str(hit.doc))
    i = 0
    rqSize = 0
    for url in rQ:
        rqSize = rqSize + 1
        print(url)
        f = codecs.open(url, 'r')
        html = f.read()
        html = html.decode('utf-8')
        tag_free = strip_tags(html)
        path = 'strippedHTML_files'
        if not os.path.exists(path):
            os.makedirs(path)
        filename = str(i)
        with open(os.path.join(path, filename), 'wb') as temp_file:
            temp_file.write(tag_free.encode('utf-8'))
        i = i + 1
    path = 'strippedHTML_files'
    i = 0
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r') as myfile:
            data = myfile.read()
            stripStopWords(data, i)
            i = i + 1
    # kTerms arrives as a string; compare it as an int
    if int(k) > 0:
        newQuery = calcNewQuery(k, q, rqSize)
        q = newQuery
        print("new query is ")
        print(q)
    self.render("index.html", title="Results", items=items, query=q, kTerms=k)
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))

start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" % (
        len(scoreDocs), duration, query)
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
if __name__ == "__main__": usedsentences = numpy.zeros((BLOOM_FILTER_SIZE, ), dtype=numpy.bool) print >> sys.stderr, "Just created bloom filter with %d entries" % usedsentences.shape[ 0] print >> sys.stderr, stats() lucene.initVM() # create an index called 'index-dir' in a temp directory # indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'), # 'index-dir') # indexDir = "/Tmp/REMOVEME.index-dir" indexDir = "lucene.ukwac" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_30) queryparser = QueryParser(Version.LUCENE_30, "text", analyzer) searcher = IndexSearcher(dir) nonzeros = 0 for i, l in enumerate(sys.stdin): if i % 100 == 0: print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % ( i, percent(nonzeros, BLOOM_FILTER_SIZE)) print >> sys.stderr, stats() l = string.strip(l) added_this_sentence = 0 for newl in retrieve(l, searcher, queryparser): # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
def searchDocuments(self, view, version, query=None, attribute=None):
    store = self.store
    if query is None:
        query = MatchAllDocsQuery()
    else:
        query = QueryParser("contents", StandardAnalyzer()).parse(query)
    if attribute:
        combinedQuery = BooleanQuery()
        combinedQuery.add(query, BooleanClause.Occur.MUST)
        combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                          BooleanClause.Occur.MUST)
        query = combinedQuery

    class _collector(PythonHitCollector):

        def __init__(_self):
            super(_collector, _self).__init__()
            _self.hits = []

        def collect(_self, id, score):
            # negate the score so the min-heap pops the best hits first
            _self.hits.append((-score, id))

    class _iterator(object):

        def __init__(_self):
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __del__(_self):
            try:
                if _self.searcher is not None:
                    _self.searcher.close()
                store.abortTransaction(view, _self.txnStatus)
            except:
                store.repository.logger.exception("in __del__")
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __iter__(_self):
            _self.txnStatus = store.startTransaction(view)
            _self.searcher = searcher = self.getIndexSearcher()
            _self.collector = _collector()
            searcher.search(query, _self.collector)
            hits = _self.collector.hits
            if hits:
                heapify(hits)
                while hits:
                    score, id = heappop(hits)
                    doc = searcher.doc(id)
                    uItem = UUID(doc['item'])
                    if long(doc['version']) <= version:
                        if store._items.isValue(view, version, uItem,
                                                UUID(doc['value'])):
                            yield uItem, UUID(doc['attribute'])

    return _iterator()
def testBoost(self):
    q = QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse("term^2")
    self.assertEqual("term^2.0", q.toString("field"))
term_list = ['robespierre', 'danton', 'xvi']
# Additional candidate terms, currently disabled: 'marat', 'mirabeau',
# 'antoinette', 'fayette', 'tyran', 'égalité', 'fraternité', 'révolution',
# 'salut', 'necker', 'napoleon', 'monarchie', 'aristocratie', 'hébert',
# 'gironde', 'jacobins', 'feuillants', 'royalistes', 'royaliste',
# 'guillotine', 'bastille', 'versailles', 'tuilleries', 'paume',
# 'constitution', 'etats', 'citoyen', 'democratie'

initVM()
# Get a handle to the index directory
directory = SimpleFSDirectory(File(STORE_DIR))
# Creates a searcher searching the provided index. Use a single
# IndexSearcher instance across queries to improve performance.
ireader = IndexReader.open(directory, True)
searcher = IndexSearcher(ireader)
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Construct a query parser; we specify the field to search in.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

appearance_dict = {}
for TERM in term_list:
    print 'Searching for: "' + TERM + '"'
    # Create the query and run it over every document in the index
    query = queryParser.parse(TERM)
    docs_containing_term = searcher.search(query, ireader.numDocs())
    docs = []
    print 'Found ' + str(len(docs_containing_term.scoreDocs)) + \
          ' documents with the term "' + TERM + '".'
# Get a handle to the index directory
directory = SimpleFSDirectory(File(STORE_DIR))
# Creates a searcher searching the provided index. Use a single
# IndexSearcher instance across queries to improve performance.
ireader = IndexReader.open(directory, True)
searcher = IndexSearcher(ireader)
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Construct a query parser and create the query
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
query = queryParser.parse(QUERY_STRING)
# Run the query and get the top 50 hits
topDocs = searcher.search(query, 50)
scoreDocs = topDocs.scoreDocs
print "%s total matching documents." % len(scoreDocs)

# Highlight the query terms in the matching documents
HighlightFormatter = SimpleHTMLFormatter()
query_score = QueryScorer(query)
highlighter = Highlighter(HighlightFormatter, query_score)
def process_query_param(param):
    """
    Escapes and lowercases a query param for searching in the Lucene index.
    """
    processed_param = QueryParser.escape(param)
    return processed_param.lower()
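# Example: QueryParser.escape backslash-escapes Lucene's special query
# characters, so user input cannot change the query structure. For instance,
# this call should yield something like 'c\+\+ \(2nd edition\)':
#
#   process_query_param('C++ (2nd Edition)')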
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least
    one term, backed by a Lucene index"""
    lucene.initVM()

    # Open the index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    # Search documents
    onlyfiles = [f for f in listdir(inputDir)
                 if isfile(join(inputDir, f)) and f.endswith('.json')]
    for f in onlyfiles:
        json_data = open(inputDir + '/' + f)
        data = json.load(json_data)
        # The results collected after comparison
        results = list()
        journal_code = f.split('.')[0]
        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']
            logger.debug("Processing URL [%s] date [%s] - [%s]" % (url, date, title))
            tokens = [t.lower() for t in nltk.word_tokenize(title)]
            entry['similars'] = list()
            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title",
                                    analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)
                logger.debug("Found %d document(s) that matched query '%s':" % (
                    hits.totalHits, q))
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    entry['similars'].append({'token': token,
                                              'url': doc.get('url'),
                                              'title': doc.get('title')})
            results.append(entry)
        json_data.close()

        print """<html>
<body>
<table><thead>
<tr>
<th>Journal</th><th>Date</th><th>Title</th><th>URL</th><th>Similar news</th>
</tr>
</thead>
<tbody>
"""
        for entry in results:
            similars_text = '<ul>'
            for s in entry['similars']:
                similars_text += '<li>[%s] [%s] [%s]</li>' % (
                    s['token'].encode('iso-8859-1', 'ignore'),
                    s['title'].encode('iso-8859-1', 'ignore'),
                    s['url'].encode('iso-8859-1', 'ignore'))
            similars_text += '</ul>'
            print """<tr>
<td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>
</tr>
""" % (journal_code,
       entry['date'].encode('iso-8859-1', 'ignore'),
       entry['title'].encode('iso-8859-1', 'ignore'),
       entry['url'].encode('iso-8859-1', 'ignore'),
       similars_text)