def searchForDbpediaURI(self, uri):
    """ Returns all sentences that are tagged with the given DBpedia URI """
    print "in searchForDbpediaURI"
    uri_old = uri
    # Strip the DBpedia namespace prefixes and all non-word characters,
    # since the query parser would otherwise trip over ':' and '/'.
    uri = uri.replace("http://dbpedia.org/ontology/", "")
    uri = uri.replace("http://dbpedia.org/property/", "")
    uri = uri.replace("http://dbpedia.org/resource/", "")
    uri = "".join(re.findall(r'[\w\s]+', uri))
    # Initialize before the try block so the except branch can return it safely.
    result = []
    try:
        qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        print "query: " + str(query)
        MAX = 500000
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["URI"]
            # Only keep exact matches on the original, unstripped URI.
            if dbpedia_uri == uri_old:
                result.append([IndexUtils.sentence_wrapper(doc["Sentence"]),
                               doc["X"], doc["Y"], dbpedia_uri])
        return result
    except:
        print "Fail in uri: " + uri
        print "Unexpected error:", sys.exc_info()[0]
        return result
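# A minimal usage sketch (hypothetical URI and instance name `index`; assumes
# the module-level `searcher` and `analyzer` are already open on the sentence
# index):
#
#   rows = index.searchForDbpediaURI("http://dbpedia.org/resource/Berlin")
#   for wrapped_sentence, x, y, uri in rows:
#       print x, "|", y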
def searchXYPair(self, x, y):
    """ Returns all sentences that are tagged with the given two entities (x, y) """
    if x == "" or y == "":
        return []
    try:
        # Strip non-word characters so the query parser accepts the term.
        x = "".join(re.findall(r'[\w\s]+', x))
        qp = QueryParser(Version.LUCENE_35, "X", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(x)
        MAX = 100000
        hits = searcher.search(query, MAX)
        # Collect sentences in a dict to remove duplicates.
        tmp_hm = {}
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            if doc["Y"] == y:
                tmp_hm[doc["Sentence"]] = ""
        result_list = [IndexUtils.sentence_wrapper(key) for key in tmp_hm]
        return result_list
    except:
        print "Fail (search XYPair) in x:" + x + " y:" + y
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
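# The strip-and-rejoin pattern above recurs throughout this module. A small
# helper like this (a sketch, not part of the original code) could replace it;
# it keeps only word characters and whitespace so that QueryParser syntax
# characters such as ':' or '/' cannot break parsing:
def strip_query_chars(text):
    return "".join(re.findall(r'[\w\s]+', text))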
def searchKey(self, key, rank=None):
    query = ""
    try:
        MAX = 100000
        qp = QueryParser(Version.LUCENE_35, "key", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(key)
        hits = searcher.search(query, MAX)
        sentence_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            try:
                # The stored field holds a serialized Python literal;
                # eval() turns it back into the original structure.
                sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
            except:
                print doc.get("sentence")
        return sentence_list
    except:
        print "Fail in receiving sentence with term " + key
        print ("query", query)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def searchForDbpediaURI(self, uri):
    """ Returns all anchor texts that are related to the given DBpedia URI.
    For each anchor text it also returns the corresponding URI and the number
    of times the anchor appears in the English Wikipedia. """
    uri_old = uri
    uri = uri.replace("http://dbpedia.org/resource/", "")
    uri = "".join(re.findall(r'[\w\s]+', uri))
    try:
        qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        MAX = 10000
        result = []
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
            # Only keep exact matches on the original, unstripped URI.
            if dbpedia_uri == uri_old:
                result.append([doc["anchor"].encode("utf-8"),
                               doc["anchor_uri"].encode("utf-8"),
                               dbpedia_uri,
                               doc["number"].encode("utf-8")])
        return result
    except:
        print "searchForDbpediaURI - Fail in uri: " + uri
        return []
def does_line_existNew(self, line, x, y):
    """ Checks if the parsed sentence already exists in the index """
    try:
        # Keep only word characters, joined with spaces, for the query parser.
        array = re.findall(r'[\w]+', line)
        string = ""
        for item in array:
            string += item + " "
        qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(string)
        MAX = 10
        hits = searcher.search(query, MAX)
        return len(hits.scoreDocs) > 0
    except Exception:
        s_tmp = str(sys.exc_info())
        if "too many boolean clauses" in s_tmp:
            # Return True so that the sentence is not re-added each time,
            # to avoid further error messages. Only occurs with very long
            # sentences.
            print "too many boolean clauses"
            return True
        else:
            print "Unexpected error:", sys.exc_info()[0]
            print "in does line exist"
            print s_tmp
            return False
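# Alternative sketch: Lucene's QueryParser also offers a static escape()
# that backslash-escapes all query syntax characters. Unlike the regex
# stripping used above it preserves the original characters, so results
# may differ depending on the analyzer:
#
#   safe = QueryParser.escape(line)
#   query = qp.parse(safe)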
def searchString(self, string):
    'searches for a string and returns an array of POS-tagged sentences'
    query = ""
    try:
        MAX = 100000
        # Replace dots with spaces so dates such as 1931.08.06 are tokenized.
        string = string.replace(".", " ")
        string = "".join(re.findall(r'[\w\s]+', string))
        qp = QueryParser(Version.LUCENE_35, "sentence", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(string)
        hits = searcher.search(query, MAX)
        sentence_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            # The stored field holds a serialized Python literal.
            sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
        return sentence_list
    except:
        print "Fail in receiving sentence with term " + string + " in search term"
        print ("query", query)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def query(indexName, queryString):
    indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
    qp = QueryParser(Version.LUCENE_CURRENT, "content",
                     StandardAnalyzer(Version.LUCENE_CURRENT))
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parse(queryString.replace("-", "_"))
    aux = indSearcher.search(query, 100)
    results = aux.scoreDocs
    ir = indSearcher.getIndexReader()
    res = []
    for r in results:
        # Fetch each hit by its document id, not by its position in the
        # result list.
        doc = ir.document(r.doc)
        res.append(doc.get('id'))
    return res
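# Usage sketch (hypothetical index directory; assumes documents were indexed
# with 'content' and 'id' fields):
#
#   ids = query("index", "information-retrieval")
#   print ids[:10]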
def search(self, string, special=None):
    query = ""
    try:
        MAX = 100000
        # Replace dots with spaces so dates such as 1931.08.06 are tokenized.
        string = string.replace(".", " ")
        string = "".join(re.findall(r'[\w\s]+', string))
        qp = QueryParser(Version.LUCENE_35, "title", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(string)
        hits = searcher.search(query, MAX)
        sentence_list = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            sentence_list.append(doc.get("title").encode("utf-8"))
        return sentence_list
    except:
        print "Fail in receiving sentence with term " + string
        print ("query", query)
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
def getResultScoreDocs(query):
    # Create the analyzer and a parser for the user-submitted query.
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    # Return the top 50 hits.
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs
    return scoreDocs
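# Usage sketch (assumes a module-level `searcher` over an index with a
# 'title' field):
#
#   for sd in getResultScoreDocs("lucene query parser"):
#       print sd.score, searcher.doc(sd.doc).get("title")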
def does_line_exist(self, line, x, y):
    """ Old, more complex check whether a sentence already exists in the
    index. Not used at the moment: it delegates to does_line_existNew, and
    the code below the first return is kept only for reference. """
    return self.does_line_existNew(line, x, y)
    # --- unreachable legacy implementation ---
    try:
        x = "".join(re.findall(r'[\w\s]+', x))
        qp = QueryParser(Version.LUCENE_35, "X", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(x)
        MAX = 100000
        hits = searcher.search(query, MAX)
        # First check if an x already exists.
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            if doc["Y"] == y:
                print "y found"
                print
                try:
                    string = "".join(re.findall(r'[\w\s]+', line))
                    qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
                    qp.setDefaultOperator(qp.Operator.AND)
                    query = qp.parse(string)
                    MAX = 10
                    hits = searcher.search(query, MAX)
                    if len(hits.scoreDocs) > 0:
                        return True
                except Exception:
                    s_tmp = str(sys.exc_info())
                    if "too many boolean clauses" in s_tmp:
                        print "too many boolean clauses"
                        return True
                    else:
                        print "Unexpected error:", sys.exc_info()[0]
                        print "in does line exist"
                        print s_tmp
        print 'nothing found'
        return False
    except:
        print "Fail (does line exists) in x:" + x + " y:" + y
        print "Unexpected error:", sys.exc_info()[0]
        print
def run(command):
    if command == '':
        return None
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
def searchForDbpediaURImax(self, uri, number):
    """ Returns at most `number` anchor texts that are related to the given
    DBpedia URI. For each anchor text it also returns the corresponding URI
    and the number of times the anchor appears in the English Wikipedia. """
    uri_old = uri
    uri = uri.replace("http://dbpedia.org/resource/", "")
    uri = "".join(re.findall(r'[\w\s]+', uri))
    try:
        qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        MAX = 10000
        result = []
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
            if dbpedia_uri == uri_old:
                result.append([doc["anchor"].encode("utf-8"),
                               doc["anchor_uri"].encode("utf-8"),
                               dbpedia_uri,
                               int(doc["number"].encode("utf-8"))])
        # Sort by occurrence count, descending, and cut off after `number`
        # entries. The slice is safe even if fewer results were found.
        result = sorted(result, key=itemgetter(3), reverse=True)
        return result[0:number]
    except:
        print "searchForDbpediaURImax - Fail in uri: " + uri
        print "Unexpected error:", sys.exc_info()[0]
        print
        return []
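# Usage sketch (hypothetical URI and instance name `index`): fetch the five
# most frequent anchor texts for a resource.
#
#   top5 = index.searchForDbpediaURImax("http://dbpedia.org/resource/Berlin", 5)
#   for anchor, anchor_uri, dbpedia_uri, count in top5:
#       print count, anchor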
def test_search(index_dir):
    ''' The test function to test the created index '''
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", STD_ANALYZER)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    query = parser.parse('email_subject:Training')
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.datetime.now() - start
    print "Found %d document(s) (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print scoreDoc.score
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        print table
def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)
    document = hits.id(docno)
    # Build a MoreLikeThis query from the matched document and search again.
    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
def search(self, query, category_id=None):
    SHOULD = BooleanClause.Occur.SHOULD
    # Parse the query against both the summary and the title field and
    # combine the two with SHOULD, so a match in either field suffices.
    parser1 = QueryParser('summary', self.analyzer)
    parser2 = QueryParser('title', self.analyzer)
    parser1.setDefaultOperator(QueryParser.AND_OPERATOR)
    parser2.setDefaultOperator(QueryParser.AND_OPERATOR)
    q1 = parser1.parse(query)
    q2 = parser2.parse(query)
    boolQuery = BooleanQuery()
    boolQuery.add(q1, SHOULD)
    boolQuery.add(q2, SHOULD)
    if category_id:
        # Restrict the results to the given category via the filter.
        self.catfilter.query = query
        self.catfilter.category_id = category_id
        hits = self.searcher.search(boolQuery, self.catfilter)
    else:
        hits = self.searcher.search(boolQuery)
    return hits
def searchForXY(self, uri):
    print "in searchForXY"
    uri_old = uri
    uri = uri.replace("http://dbpedia.org/ontology/", "")
    uri = uri.replace("http://dbpedia.org/property/", "")
    uri = uri.replace("http://dbpedia.org/resource/", "")
    uri = "".join(re.findall(r'[\w\s]+', uri))
    hm = {}
    try:
        qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        print "query: " + str(query)
        MAX = 500000
        result = []
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["URI"].encode("utf-8")
            if dbpedia_uri == uri_old:
                x = doc["X"]
                y = doc["Y"]
                # Deduplicate (x, y) pairs via the dict.
                term = x + " " + y
                if term not in hm:
                    hm[term] = ""
                    result.append([x, y])
        return result
    except:
        print "Fail in uri: " + uri
        return []
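# Usage sketch (hypothetical URI and instance name `index`): list the
# distinct (x, y) entity pairs tagged with a property.
#
#   pairs = index.searchForXY("http://dbpedia.org/ontology/birthPlace")
#   for x, y in pairs:
#       print x, "->", y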
def run(searcher, parser):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query: ")
        if command == '':
            return
        print "Searching for:", command
        query = parser.parse(command)
        hits = searcher.search(query, Sort("population", True))
        print "%s total matching documents." % hits.length()
        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            print 'name:', doc.get("name"), ' state:', doc.get("state")

if __name__ == '__main__':
    STORE_DIR = "index"
    initVM(CLASSPATH)
    print 'lucene', VERSION
    directory = FSDirectory.getDirectory(STORE_DIR)
    searcher = IndexSearcher(directory)
    analyzer = StopAnalyzer()
    parser = QueryParser("all_names", analyzer)
    parser.setDefaultOperator(parser.AND_OPERATOR)
    run(searcher, parser)
    searcher.close()
if o == "--format":
    format = a
elif o == "--index":
    indexDir = a
elif o == "--stats":
    stats = True

class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)
fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue()) for field in doc.getFields())
    print template.substitute(table)
def search_by_field(self, query, field='summary'):
    parser = QueryParser(field, self.analyzer)
    parser.setDefaultOperator(QueryParser.AND_OPERATOR)
    q = parser.parse(query)
    return self.searcher.search(q)
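# Usage sketch (hypothetical instance name `engine`; assumes its analyzer
# and searcher are initialized):
#
#   hits = engine.search_by_field("python lucene", field='title')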
if o == "--format": format = a elif o == "--index": indexDir = a elif o == "--stats": stats = True class CustomTemplate(Template): delimiter = '#' template = CustomTemplate(format) fsDir = SimpleFSDirectory(indexDir) searcher = IndexSearcher(fsDir, True) parser = QueryParser(Version.LUCENE_CURRENT, "keywords", StandardAnalyzer(Version.LUCENE_CURRENT)) parser.setDefaultOperator(QueryParser.Operator.AND) query = parser.parse(' '.join(args)) start = datetime.now() scoreDocs = searcher.search(query, 50).scoreDocs duration = datetime.now() - start if stats: print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) print template.substitute(table)