# Exemplo n.º 1
# 0
    def __init__(self, user_loc_string, debug=False):
        """Open the on-disk Lucene index, build the multi-field parser,
        and resolve the user's home location from ``user_loc_string``."""
        self.DEBUG = debug

        # The index lives in the local "index" directory; queries span the
        # standard place-name fields.
        stop_analyzer = StopAnalyzer()
        place_fields = ['name', 'alternate_names', 'state']
        index_directory = FSDirectory.getDirectory("index")

        self.searcher = IndexSearcher(index_directory)
        self.parser = MultiFieldQueryParser(place_fields, stop_analyzer)
        self.user_location = self.doSearch(user_loc_string)
# Exemplo n.º 2
# 0
def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index

    Returns a (results, total_hits) tuple; each result is a
    (term_id, name, matched_fields) triple for the requested page only.
    """
    store = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(store)
    fields = index_metadata.keys()

    # The UI pages from 1 while lucene offsets from 0, so shift down one.
    page_index = int(search_params.get('page', 1)) - 1
    offset = page_index * records_per_page

    # Advanced searches assemble a BooleanQuery piece by piece; a basic
    # search goes through a single MultiFieldQueryParser.
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param(search_params.get('q')))

    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    # Never walk past the last hit; may be <= 0 on an out-of-range page,
    # in which case the loop below simply does nothing.
    page_size = min(total_hits - offset, records_per_page)

    results = []
    for pos in xrange(0, page_size):
        score_doc = hits.scoreDocs[offset + pos]
        doc = searcher.doc(score_doc.doc)
        explanation = searcher.explain(query, score_doc.doc)
        matched = get_field_matches(explanation.toString(), fields)
        results.append((doc.get('term id'), doc.get('name'), list(matched)))

    searcher.close()
    return (results, total_hits)
    def __init__(self, user_loc_string, debug=False):
        """Wire up the Lucene searcher/parser pair over the local "index"
        directory and look up the user's location string."""
        self.DEBUG = debug

        # Query the standard place-name fields with a stop-word analyzer.
        analyzer_for_fields = StopAnalyzer()
        searchable = ["name", "alternate_names", "state"]
        fs_directory = FSDirectory.getDirectory("index")

        self.searcher = IndexSearcher(fs_directory)
        self.parser = MultiFieldQueryParser(searchable, analyzer_for_fields)
        self.user_location = self.doSearch(user_loc_string)
# Exemplo n.º 4
# 0
def search_lucene_index(index_dir, query_model, limit):
    '''
    This function searches query model (query terms along with their 
    meta data) in the learned lucene index
    
    Arguments: 
        index_dir - the lucene index directory 
        query_model - the query model (contains query terms, meta data, and conjunctions) 
        limit - the number of records to be retrieved 
    Return: 
        rows - the returned document details 

    
    '''
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, query_model[1], STD_ANALYZER)
    query = parser.parse(Version.LUCENE_CURRENT, query_model[0], query_model[1], query_model[2], STD_ANALYZER)
    scoreDocs = searcher.search(query, limit).scoreDocs
    
    print "Found %d document(s) that matched query '%s':" %(len(scoreDocs), query)
    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
    def testSpecifiedOperator(self):
        """A MUST clause on both fields requires 'development' in each."""
        must = BooleanClause.Occur.MUST
        query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT, "development",
            ["title", "subject"], [must, must], SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        top_docs = searcher.search(query, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, top_docs,
                                    "Java Development with Ant")
        self.assertEqual(1, len(top_docs), "one and only one")
# Exemplo n.º 6
# 0
    def testSpecifiedOperator(self):
        """Requiring every field (MUST) should match exactly one book."""
        occur_must = BooleanClause.Occur.MUST
        fields = ["title", "subject"]
        query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT, "development", fields,
            [occur_must, occur_must], SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        matches = searcher.search(query, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, matches,
                                    "Java Development with Ant")
        self.assertEqual(1, len(matches), "one and only one")
# Exemplo n.º 7
# 0
def search_lucene_index(search_params, index_dir, index_metadata, records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index

    Returns (results, total_hits): `results` holds (term_id, name,
    matched_fields) tuples for the requested page.
    """
    directory = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(directory)
    searchable_fields = index_metadata.keys()

    # Translate the 1-based UI page number into a 0-based record offset.
    current_page = int(search_params.get('page', 1)) - 1
    start = current_page * records_per_page

    # An advanced search builds a BooleanQuery in parts; the basic path
    # uses one MultiFieldQueryParser across all indexed fields.
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(
            search_params, search_params.get('operator'), analyzer)
    else:
        basic_parser = MultiFieldQueryParser(
            Version.LUCENE_CURRENT, searchable_fields, analyzer)
        query = MultiFieldQueryParser.parse(
            basic_parser, process_query_param(search_params.get('q')))

    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    # Clamp to whatever is left after the offset (loop is a no-op if <= 0).
    remaining = min(total_hits - start, records_per_page)

    results = []
    for pos in xrange(0, remaining):
        score_doc = hits.scoreDocs[start + pos]
        doc = searcher.doc(score_doc.doc)
        explanation = searcher.explain(query, score_doc.doc)
        matched = get_field_matches(explanation.toString(), searchable_fields)
        results.append((doc.get('term id'), doc.get('name'), list(matched)))

    searcher.close()
    return (results, total_hits)
    def testDefaultOperator(self):
        """With SHOULD on both fields, a match in either field qualifies."""
        should = BooleanClause.Occur.SHOULD
        query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT, "development",
            ["title", "subject"], [should, should], SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        top_docs = searcher.search(query, 50).scoreDocs

        # "development" in the title field
        self.assertHitsIncludeTitle(searcher, top_docs,
                                    "Java Development with Ant")

        # has "development" in the subject field
        self.assertHitsIncludeTitle(searcher, top_docs,
                                    "Extreme Programming Explained")
# Exemplo n.º 9
# 0
    def testDefaultOperator(self):
        """SHOULD clauses make the fields alternatives, not requirements."""
        occur_should = BooleanClause.Occur.SHOULD
        fields = ["title", "subject"]
        query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT, "development", fields,
            [occur_should, occur_should], SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        matches = searcher.search(query, 50).scoreDocs

        # matched via the title field
        self.assertHitsIncludeTitle(searcher, matches,
                                    "Java Development with Ant")

        # has "development" in the subject field
        self.assertHitsIncludeTitle(searcher, matches,
                                    "Extreme Programming Explained")
# Exemplo n.º 10
# 0
def find(terms):
    """Use the Lucene index to find monsters"""
    query_text = ' '.join(terms)
    searcher = IndexSearcher(STORE)

    should = BooleanClause.Occur.SHOULD
    query = MultiFieldQueryParser.parse(
        query_text, ['name_', 'full_text'], [should, should],
        StandardAnalyzer())
    hits = searcher.search(query)

    found = []
    for position, hit in enumerate(hits):
        doc = Hit.cast_(hit).getDocument()
        found.append(MyHit(doc, hits, position))
        # Breaks after appending index 10, i.e. at most 11 results.
        if position == 10:
            break

    return found
# Exemplo n.º 11
# 0
class QuestionAardvark(object):
    """Interactive question answerer: extracts prepositional phrases from a
    question, looks each phrase up in a Lucene index of place names, and
    prints the matched locations. (Python 2: print statements, raw_input.)"""
    prepositions = ['aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid', 'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by', 'concerning', 'considering', 'despite', 'down', 'except', 'excepting', 'excluding', 'following', 'for', 'from', 'in', 'inside', 'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'opposite', 'outside', 'over', 'past', 'regarding', 'round', 'save', 'since', 'than', 'through', 'to', 'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'via', 'with', 'within', 'without']
    # pronouns = ['the', 'this', 'that', 'my', 'mine', 'yours', 'his', 'hers', 'its', 'our', 'their']
    # want preposition list in regex of form: '(alpha|beta)(?!.*(alpha|beta)).*'
    # The lookahead ensures group(1) is the LAST preposition in the text and
    # group(2) is everything after it (the phrase body).
    prep_phrase = re.compile(r'\b(' + '|'.join(prepositions) + r')\b'
            + r'((?!.+\b(' + '|'.join(prepositions) + r')\b).*)')
    # identifies phrases referring to the user's current location
    current_location = re.compile(r'\b(here|this)\b')

    def __init__(self, user_loc_string, debug=False):
        # Open the on-disk index and pre-resolve the user's own location so
        # "here"/"this" questions can fall back to it.
        analyzer = StopAnalyzer()
        fields = ['name', 'alternate_names', 'state']
        directory = FSDirectory.getDirectory("index")

        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        self.user_location = self.doSearch(user_loc_string)

    def run(self):
        """ Loops indefinitely and accepts questions from the user.
            Identifies prepositional phrases and returns corresponding
            locations, sorted by population, to the user.
        """
        while True:
            question= raw_input("\nEnter your Aardvark question: ")
            if question == '':
                return

            # logic here to feed into doSearch
            # find all prepositional phrases
            # Strip punctuation first (Py2 str.translate with deletechars).
            question = question.translate(string.maketrans("",""),
                string.punctuation)

            locations = []
            match = self.prep_phrase.search(question)
            while match is not None:
                if self.DEBUG: print "Found: ", match.group()
                
                # group(2) is the phrase body after the preposition.
                doc = self.doSearch(match.group(2))
                if doc is not None:
                    if self.DEBUG: print doc.get('name')
                    locations.append(doc)
                else:
                    # No place matched; if the question says "here"/"this",
                    # substitute the user's own location (when known).
                    cur_loc_ref = self.current_location.search(question)
                    if cur_loc_ref is not None and self.user_location is not None:
                        locations.append(self.user_location)

                # Trim the consumed tail and scan for an earlier phrase.
                question = question[:match.start()]
                match = self.prep_phrase.search(question)

            # sorting locations, one ID'd per prepositional phrase
            # NOTE(review): `compare` is a Py2 cmp function defined elsewhere
            # in this module — presumably orders by population; confirm.
            locations.sort(compare)
            print [doc.get('name').title() \
                + ', ' + doc.get('state').upper() \
                + ': ' + doc.get('population') for doc in locations]


    def doSearch(self, string):
        """ Does the actual interacting with Lucene """
        # NOTE: the `string` parameter shadows the `string` module here,
        # which is safe only because this method never uses the module.
        try:
            # PyLucene static-parse idiom: the parser instance is passed as
            # the first argument to its own parse().
            query = self.parser.parse(self.parser, string)
            hits = self.searcher.search(query)

            if hits.length() > 0:
                return hits[0]
            return None
        except JavaError:
            # Any Java-side parse/search failure is treated as "no match".
            return None
class QuestionAardvark(object):
    prepositions = [
        "aboard",
        "about",
        "above",
        "across",
        "after",
        "against",
        "along",
        "amid",
        "among",
        "anti",
        "around",
        "as",
        "at",
        "before",
        "behind",
        "below",
        "beneath",
        "beside",
        "besides",
        "between",
        "beyond",
        "but",
        "by",
        "concerning",
        "considering",
        "despite",
        "down",
        "except",
        "excepting",
        "excluding",
        "following",
        "for",
        "from",
        "in",
        "inside",
        "into",
        "like",
        "near",
        "of",
        "off",
        "on",
        "onto",
        "opposite",
        "outside",
        "over",
        "past",
        "regarding",
        "round",
        "save",
        "since",
        "than",
        "through",
        "to",
        "toward",
        "towards",
        "under",
        "underneath",
        "unlike",
        "until",
        "up",
        "upon",
        "via",
        "with",
        "within",
        "without",
    ]
    # pronouns = ['the', 'this', 'that', 'my', 'mine', 'yours', 'his', 'hers', 'its', 'our', 'their']
    # want preposition list in regex of form: '(alpha|beta)(?!.*(alpha|beta)).*'
    prep_phrase = re.compile(
        r"\b(" + "|".join(prepositions) + r")\b" + r"((?!.+\b(" + "|".join(prepositions) + r")\b).*)"
    )
    # identifies phrases referring to the user's current location
    current_location = re.compile(r"\b(here|this)\b")

    def __init__(self, user_loc_string, debug=False):
        analyzer = StopAnalyzer()
        fields = ["name", "alternate_names", "state"]
        directory = FSDirectory.getDirectory("index")

        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        self.user_location = self.doSearch(user_loc_string)

    def run(self):
        """ Loops indefinitely and accepts questions from the user.
            Identifies prepositional phrases and returns corresponding
            locations, sorted by population, to the user.
        """
        while True:
            question = raw_input("\nEnter your Aardvark question: ")
            if question == "":
                return

            # logic here to feed into doSearch
            # find all prepositional phrases
            question = question.translate(string.maketrans("", ""), string.punctuation)

            locations = []
            match = self.prep_phrase.search(question)
            while match is not None:
                if self.DEBUG:
                    print "Found: ", match.group()

                doc = self.doSearch(match.group(2))
                if doc is not None:
                    if self.DEBUG:
                        print doc.get("name")
                    locations.append(doc)
                else:
                    cur_loc_ref = self.current_location.search(question)
                    if cur_loc_ref is not None and self.user_location is not None:
                        locations.append(self.user_location)

                question = question[: match.start()]
                match = self.prep_phrase.search(question)

            # sorting locations, one ID'd per prepositional phrase
            locations.sort(compare)
            print [
                doc.get("name").title() + ", " + doc.get("state").upper() + ": " + doc.get("population")
                for doc in locations
            ]

    def doSearch(self, string):
        """ Does the actual interacting with Lucene """
        try:
            query = self.parser.parse(self.parser, string)
            hits = self.searcher.search(query)

            if hits.length() > 0:
                return hits[0]
            return None
        except JavaError:
            return None