def __init__(self, user_loc_string, debug=False):
    """Build the Lucene searcher/parser over the on-disk index and resolve
    the user's home location from the supplied free-text string."""
    self.DEBUG = debug
    index_dir = FSDirectory.getDirectory("index")
    self.searcher = IndexSearcher(index_dir)
    search_fields = ['name', 'alternate_names', 'state']
    self.parser = MultiFieldQueryParser(search_fields, StopAnalyzer())
    # Resolved once up front; used later when the question says "here"/"this".
    self.user_location = self.doSearch(user_loc_string)
def search_lucene_index(search_params, index_dir, index_metadata, records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index.

    Arguments:
        search_params - request parameters ('q', 'page', 'adv_search', 'operator')
        index_dir - path to the lucene index directory
        index_metadata - mapping of indexed field name -> analyzer metadata
        records_per_page - number of hits returned per page

    Returns:
        (results, total_hits): results is a list of
        (term_id, name, matched_field_list) tuples for the requested page.
    """
    results = []
    # Distinct local name so the index_dir parameter is not shadowed.
    directory = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(directory)
    index_fields = index_metadata.keys()

    # Page numbers are 1-based in the UI; convert to a 0-based record offset.
    page = int(search_params.get('page', 1)) - 1
    offset = page * records_per_page

    # An advanced search builds a BooleanQuery in parts; a basic search runs
    # one MultiFieldQueryParser across every indexed field.
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields,
                                       analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param(search_params.get('q')))

    # BUG FIX: close the searcher even if search/explain raises, so the
    # underlying index files are not leaked on error.
    try:
        hits = searcher.search(query, 10000)
        total_hits = hits.totalHits
        count = min(hits.totalHits - offset, records_per_page)
        for i in xrange(0, count):
            score_doc = hits.scoreDocs[offset + i]
            doc = searcher.doc(score_doc.doc)
            term_id = doc.get('term id')
            name = doc.get('name')
            # The Explanation text tells us which fields matched the query.
            explain = searcher.explain(query, score_doc.doc)
            match_fields = get_field_matches(explain.toString(), index_fields)
            results.append((term_id, name, list(match_fields)))
    finally:
        searcher.close()

    return (results, total_hits)
def __init__(self, user_loc_string, debug=False):
    """Open the on-disk Lucene index and prepare a multi-field parser,
    then look up the user's location from the given description."""
    stop_analyzer = StopAnalyzer()
    searchable_fields = ["name", "alternate_names", "state"]
    self.DEBUG = debug
    self.searcher = IndexSearcher(FSDirectory.getDirectory("index"))
    self.parser = MultiFieldQueryParser(searchable_fields, stop_analyzer)
    self.user_location = self.doSearch(user_loc_string)
def search_lucene_index(index_dir, query_model, limit): ''' This function searches query model (query terms along with their meta data) in the learned lucene index Arguments: index_dir - the lucene index directory query_model - the query model (contains query terms, meta data, and conjunctions) limit - the number of records to be retrieved Return: rows - the returned document details ''' store = SimpleFSDirectory(File(index_dir)) searcher = IndexSearcher(store, True) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, query_model[1], STD_ANALYZER) query = parser.parse(Version.LUCENE_CURRENT, query_model[0], query_model[1], query_model[2], STD_ANALYZER) scoreDocs = searcher.search(query, limit).scoreDocs print "Found %d document(s) that matched query '%s':" %(len(scoreDocs), query) rows = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) row = [] metadata = MetadataType._types for field in metadata: if table.get(field,'empty') != 'empty' : row.append(table.get(field,'empty')) else: row.append('') row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file row.append(scoreDoc.score) rows.append(row) return rows
def testSpecifiedOperator(self):
    """With every clause forced to MUST, only documents matching the term
    in ALL listed fields are returned."""
    must = BooleanClause.Occur.MUST
    query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, "development",
        ["title", "subject"], [must, must], SimpleAnalyzer())
    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Java Development with Ant")
    self.assertEqual(1, len(scoreDocs), "one and only one")
def search_lucene_index(search_params, index_dir, index_metadata, records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index.

    Arguments:
        search_params - request parameters ('q', 'page', 'adv_search', 'operator')
        index_dir - path to the lucene index directory
        index_metadata - mapping of indexed field name -> analyzer metadata
        records_per_page - number of hits returned per page

    Returns:
        (results, total_hits): results holds (term_id, name, matched_fields)
        tuples for the requested page.
    """
    results = []
    # Keep the index_dir parameter intact instead of shadowing it.
    store = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(store)
    index_fields = index_metadata.keys()

    # UI pagination is 1-based; convert to a 0-based record offset.
    page = int(search_params.get('page', 1)) - 1
    offset = page * records_per_page

    # Advanced search assembles a BooleanQuery piecewise; the basic path uses
    # a single MultiFieldQueryParser over all indexed fields.
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields,
                                       analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param(search_params.get('q')))

    # BUG FIX: guarantee searcher.close() runs even when searching or
    # explaining a hit raises, preventing a leaked index handle.
    try:
        hits = searcher.search(query, 10000)
        total_hits = hits.totalHits
        count = min(hits.totalHits - offset, records_per_page)
        for i in xrange(0, count):
            score_doc = hits.scoreDocs[offset + i]
            doc = searcher.doc(score_doc.doc)
            term_id = doc.get('term id')
            name = doc.get('name')
            explain = searcher.explain(query, score_doc.doc)
            match_fields = get_field_matches(explain.toString(), index_fields)
            results.append((term_id, name, list(match_fields)))
    finally:
        searcher.close()

    return (results, total_hits)
def testDefaultOperator(self):
    """With SHOULD clauses, a match in ANY of the listed fields is enough,
    so both titles show up in the hits."""
    should = BooleanClause.Occur.SHOULD
    query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, "development",
        ["title", "subject"], [should, should], SimpleAnalyzer())
    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Java Development with Ant")
    # has "development" in the subject field
    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Extreme Programming Explained")
def find(terms):
    """Use the Lucene index to find monsters"""
    phrase = ' '.join(terms)
    should = BooleanClause.Occur.SHOULD
    query = MultiFieldQueryParser.parse(
        phrase, ['name_', 'full_text'], [should, should], StandardAnalyzer())
    searcher = IndexSearcher(STORE)
    hits = searcher.search(query)

    results = []
    for index, hit in enumerate(hits):
        doc = Hit.cast_(hit).getDocument()
        results.append(MyHit(doc, hits, index))
        # NOTE(review): breaking after appending index 10 returns up to 11
        # hits (0..10) — confirm whether 10 was the intended cap.
        if index == 10:
            break
    return results
class QuestionAardvark(object):
    """Interactive loop that reads questions, pulls out prepositional
    phrases, and looks each phrase up as a place name in a Lucene index.

    NOTE(review): Python 2 / PyLucene code (print statements, raw_input,
    cmp-style sort) — confirm runtime before porting.
    """

    # Prepositions used to detect the start of a prepositional phrase.
    prepositions = ['aboard', 'about', 'above', 'across', 'after', 'against',
                    'along', 'amid', 'among', 'anti', 'around', 'as', 'at',
                    'before', 'behind', 'below', 'beneath', 'beside',
                    'besides', 'between', 'beyond', 'but', 'by', 'concerning',
                    'considering', 'despite', 'down', 'except', 'excepting',
                    'excluding', 'following', 'for', 'from', 'in', 'inside',
                    'into', 'like', 'near', 'of', 'off', 'on', 'onto',
                    'opposite', 'outside', 'over', 'past', 'regarding',
                    'round', 'save', 'since', 'than', 'through', 'to',
                    'toward', 'towards', 'under', 'underneath', 'unlike',
                    'until', 'up', 'upon', 'via', 'with', 'within', 'without']

    # pronouns = ['the', 'this', 'that', 'my', 'mine', 'yours', 'his', 'hers', 'its', 'our', 'their']

    # want preposition list in regex of form: '(alpha|beta)(?!.*(alpha|beta)).*'
    # i.e. match the LAST preposition in the string plus everything after it;
    # group(2) is the phrase body following that preposition.
    prep_phrase = re.compile(r'\b(' + '|'.join(prepositions) + r')\b'
                             + r'((?!.+\b(' + '|'.join(prepositions)
                             + r')\b).*)')

    # identifies phrases referring to the user's current location
    current_location = re.compile(r'\b(here|this)\b')

    def __init__(self, user_loc_string, debug=False):
        """Open the Lucene index, build the multi-field parser, and resolve
        the user's location from user_loc_string.

        Arguments:
            user_loc_string - free-text description of the user's location
            debug - when True, print matching diagnostics in run()
        """
        analyzer = StopAnalyzer()
        fields = ['name', 'alternate_names', 'state']
        directory = FSDirectory.getDirectory("index")
        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        # Looked up once; reused when a question says "here"/"this".
        self.user_location = self.doSearch(user_loc_string)

    def run(self):
        """
        Loops indefinitely and accepts questions from the user.
        Identifies prepositional phrases and returns corresponding
        locations, sorted by population, to the user.
        """
        while True:
            question = raw_input("\nEnter your Aardvark question: ")
            if question == '':
                return

            # logic here to feed into doSearch
            # find all prepositional phrases

            # Strip punctuation so the regex sees bare words (Python 2
            # str.translate with a deletion table).
            question = question.translate(string.maketrans("", ""),
                                          string.punctuation)
            locations = []
            match = self.prep_phrase.search(question)
            while match is not None:
                if self.DEBUG:
                    print "Found: ", match.group()
                # group(2) is the phrase body after the preposition.
                doc = self.doSearch(match.group(2))
                if doc is not None:
                    if self.DEBUG:
                        print doc.get('name')
                    locations.append(doc)
                else:
                    # No index hit: fall back to the user's own location if
                    # the phrase refers to "here"/"this".
                    cur_loc_ref = self.current_location.search(question)
                    if cur_loc_ref is not None and self.user_location is not None:
                        locations.append(self.user_location)
                # Trim the matched tail and rescan for the next (earlier)
                # prepositional phrase.
                question = question[:match.start()]
                match = self.prep_phrase.search(question)

            # sorting locations, one ID'd per prepositional phrase
            # (cmp-style comparator `compare` is defined elsewhere in the file)
            locations.sort(compare)
            print [doc.get('name').title()
                   + ', ' + doc.get('state').upper()
                   + ': ' + doc.get('population') for doc in locations]

    def doSearch(self, string):
        """ Does the actual interacting with Lucene.

        Returns the top-scoring document for the query string, or None when
        there are no hits or the query fails to parse.
        """
        try:
            # NOTE(review): passing self.parser as the first argument looks
            # like the PyLucene static-overload workaround for
            # MultiFieldQueryParser.parse — confirm against the PyLucene
            # version in use.
            query = self.parser.parse(self.parser, string)
            hits = self.searcher.search(query)
            if hits.length() > 0:
                return hits[0]
            return None
        except JavaError:
            # Unparseable query or searcher failure: treat as "no match".
            return None
class QuestionAardvark(object):
    """Interactive loop that reads questions, extracts prepositional
    phrases, and resolves each phrase to a location via a Lucene index.

    NOTE(review): Python 2 / PyLucene code (print statements, raw_input,
    cmp-style sort) — confirm runtime before porting.
    """

    # Prepositions used to detect the start of a prepositional phrase.
    prepositions = [
        "aboard", "about", "above", "across", "after", "against", "along",
        "amid", "among", "anti", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "besides", "between", "beyond", "but",
        "by", "concerning", "considering", "despite", "down", "except",
        "excepting", "excluding", "following", "for", "from", "in", "inside",
        "into", "like", "near", "of", "off", "on", "onto", "opposite",
        "outside", "over", "past", "regarding", "round", "save", "since",
        "than", "through", "to", "toward", "towards", "under", "underneath",
        "unlike", "until", "up", "upon", "via", "with", "within", "without",
    ]

    # pronouns = ['the', 'this', 'that', 'my', 'mine', 'yours', 'his', 'hers', 'its', 'our', 'their']

    # want preposition list in regex of form: '(alpha|beta)(?!.*(alpha|beta)).*'
    # i.e. match the LAST preposition in the string plus everything after it;
    # group(2) is the phrase body following that preposition.
    prep_phrase = re.compile(
        r"\b(" + "|".join(prepositions) + r")\b"
        + r"((?!.+\b(" + "|".join(prepositions) + r")\b).*)"
    )

    # identifies phrases referring to the user's current location
    current_location = re.compile(r"\b(here|this)\b")

    def __init__(self, user_loc_string, debug=False):
        """Open the Lucene index, build the multi-field parser, and resolve
        the user's location from user_loc_string.

        Arguments:
            user_loc_string - free-text description of the user's location
            debug - when True, print matching diagnostics in run()
        """
        analyzer = StopAnalyzer()
        fields = ["name", "alternate_names", "state"]
        directory = FSDirectory.getDirectory("index")
        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        # Looked up once; reused when a question says "here"/"this".
        self.user_location = self.doSearch(user_loc_string)

    def run(self):
        """
        Loops indefinitely and accepts questions from the user.
        Identifies prepositional phrases and returns corresponding
        locations, sorted by population, to the user.
        """
        while True:
            question = raw_input("\nEnter your Aardvark question: ")
            if question == "":
                return

            # logic here to feed into doSearch
            # find all prepositional phrases

            # Strip punctuation so the regex sees bare words (Python 2
            # str.translate with a deletion table).
            question = question.translate(string.maketrans("", ""),
                                          string.punctuation)
            locations = []
            match = self.prep_phrase.search(question)
            while match is not None:
                if self.DEBUG:
                    print "Found: ", match.group()
                # group(2) is the phrase body after the preposition.
                doc = self.doSearch(match.group(2))
                if doc is not None:
                    if self.DEBUG:
                        print doc.get("name")
                    locations.append(doc)
                else:
                    # No index hit: fall back to the user's own location if
                    # the phrase refers to "here"/"this".
                    cur_loc_ref = self.current_location.search(question)
                    if cur_loc_ref is not None and self.user_location is not None:
                        locations.append(self.user_location)
                # Trim the matched tail and rescan for the next (earlier)
                # prepositional phrase.
                question = question[: match.start()]
                match = self.prep_phrase.search(question)

            # sorting locations, one ID'd per prepositional phrase
            # (cmp-style comparator `compare` is defined elsewhere in the file)
            locations.sort(compare)
            print [
                doc.get("name").title()
                + ", "
                + doc.get("state").upper()
                + ": "
                + doc.get("population")
                for doc in locations
            ]

    def doSearch(self, string):
        """ Does the actual interacting with Lucene.

        Returns the top-scoring document for the query string, or None when
        there are no hits or the query fails to parse.
        """
        try:
            # NOTE(review): passing self.parser as the first argument looks
            # like the PyLucene static-overload workaround for
            # MultiFieldQueryParser.parse — confirm against the PyLucene
            # version in use.
            query = self.parser.parse(self.parser, string)
            hits = self.searcher.search(query)
            if hits.length() > 0:
                return hits[0]
            return None
        except JavaError:
            # Unparseable query or searcher failure: treat as "no match".
            return None