def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example #3
 def does_line_exist(self,line,x,y):
     """
     Old, more complex function if a sentence already exists in the index.
     Not used in the moment
     """
     return self.does_line_existNew(line, x, y)
     try:
         array = re.findall(r'[\w\s]+',x)
         x = ""
         for item in array:
             x+=item
         qp = QueryParser(Version.LUCENE_35, "X", analyzer)
         qp.setDefaultOperator(qp.Operator.AND)
         query = qp.parse(x)
                     
         MAX = 100000
         hits = searcher.search(query, MAX)
         #First check, if an x already exists
         for hit in hits.scoreDocs:
             doc = searcher.doc(hit.doc)
             y_entry = doc["Y"]
             if y_entry == y:
                 print "y found"
                 print
                 try:
                     array = re.findall(r'[\w\s]+',line)
                     string = ""
                     for item in array:
                         string+=item
                     qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
                     qp.setDefaultOperator(qp.Operator.AND)
                     query = qp.parse(string)
                     MAX = 10
                     hits = searcher.search(query, MAX)
                     if len(hits.scoreDocs)>0:
                         return True
                 except Exception:
                     s_tmp =  str(sys.exc_info())
                     if "too many boolean clauses" in s_tmp:
                         print "too many boolean clauses"
                         return True
                     else:
                         print "Unexpected error:", sys.exc_info()[0]
                         print "in does line exist"
                         print s_tmp
         print 'nothing found'
         return False
     except:
         print("Fail (does line exists) in x:"+x+" y:"+y)
         print "Unexpected error:", sys.exc_info()[0]
         print
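A pattern worth calling out before the next examples: nearly every snippet here strips user input down to word characters and whitespace with re.findall(r'[\w\s]+', ...) before handing it to QueryParser, so that Lucene syntax characters (:, *, ~, quotes, parentheses) cannot break the parse. A minimal sketch of the idiom in isolation, assuming a PyLucene 3.x environment like the examples above; note that QueryParser also provides a static escape() method that backslash-escapes the special characters instead of dropping them.

import re

def sanitize_for_queryparser(text):
    # keep only word characters and whitespace so QueryParser never
    # sees syntax characters such as ':' or '*'
    return "".join(re.findall(r'[\w\s]+', text))

# hypothetical usage, with 'analyzer' defined at module level as above:
# qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
# query = qp.parse(sanitize_for_queryparser('some "raw" user:input*'))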
Example #4
    def search(self, string, special=None):
        query = ""
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            qp = QueryParser(Version.LUCENE_35, "title", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
#            print ("query",query)
                        
            hits = searcher.search(query, MAX)
            
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                sentence_list.append(doc.get("title").encode("utf-8"))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example #5
    def main(cls, argv):

        allBooks = MatchAllDocsQuery()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        query = BooleanQuery()
        query.add(allBooks, BooleanClause.Occur.SHOULD)
        query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        example = SortingExample(directory)

        example.displayResults(query, Sort.RELEVANCE)
        example.displayResults(query, Sort.INDEXORDER)
        example.displayResults(query,
                               Sort(SortField("category", SortField.STRING)))
        example.displayResults(query,
                               Sort(SortField("pubmonth", SortField.INT, True)))

        example.displayResults(query,
                               Sort([SortField("category", SortField.STRING),
                                     SortField.FIELD_SCORE,
                                     SortField("pubmonth", SortField.INT, True)]))

        example.displayResults(query,
                               Sort([SortField.FIELD_SCORE,
                                     SortField("category", SortField.STRING)]))
        directory.close()
Example #6
    def searchForDbpediaURI(self, uri):
        """
        Returns all anchor texts related to the given DBpedia URI.
        Also returns, for each anchor text, the corresponding URI and how often
        the anchor appears on the English Wikipedia.
        """
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            MAX = 10000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
                if dbpedia_uri == uri_old:
                    result.append([doc["anchor"].encode("utf-8"), doc["anchor_uri"].encode("utf-8"), dbpedia_uri, doc["number"].encode("utf-8")])
            return result
        except:
            print("searchForDbpediaURI - Fail in uri: "+uri)
            return []
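A brief usage sketch for the method above; the wrapper class name and the URI are hypothetical, while the returned tuple layout comes from the snippet itself:

# hypothetical driver code; AnchorIndex is an assumed wrapper class
idx = AnchorIndex()
for anchor, anchor_uri, dbpedia_uri, count in \
        idx.searchForDbpediaURI("http://dbpedia.org/resource/Berlin"):
    print anchor, anchor_uri, count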
Example #7
    def searchForDbpediaURI(self, uri):
        """
        Returns all sentences, which are tagged with the given DBpedia URI
        """
        print "in searchForDbpediaURI" 
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/ontology/","")
        uri = uri.replace("http://dbpedia.org/property/","")
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            print "query: "+str(query)
            MAX = 500000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["URI"]
                if dbpedia_uri == uri_old:
                    result.append([IndexUtils.sentence_wrapper(doc["Sentence"]), doc["X"], doc["Y"],dbpedia_uri])
            return result
        except:
            print("Fail in uri: "+uri)
            print "Unexpected error:", sys.exc_info()[0]
            # 'result' may not be bound yet if the parse failed, so return []
            return []
Example #8
 def does_line_existNew(self,line,x,y):
     """
     Checks, if parsed sentence already exists in index
     """
     query = ""
     try:
         array = re.findall(r'[\w]+',line)
         string = ""
         for item in array:
             string+=item+" "
         qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
         qp.setDefaultOperator(qp.Operator.AND)
         query = qp.parse(string)
         
         MAX = 10
         hits = searcher.search(query, MAX)
         if len(hits.scoreDocs)>0:
             return True
         else:
             return False
     except Exception:
         s_tmp =  str(sys.exc_info())
         if "too many boolean clauses" in s_tmp:
             print "too many boolean clauses"
             """
             Returns true, so that the sentence is not added each time, to avoid further error messages.
             Only occours with very large sentences.
             """
             return True
         else:
             print "Unexpected error:", sys.exc_info()[0]
             print "in does line exist"
             print s_tmp
     return False
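How a check like the one above is typically used during indexing: a sketch, assuming a hypothetical add_line() method that writes the sentence to the index.

 def add_line_if_new(self, line, x, y):
     # index the sentence only when the existence check fails;
     # add_line is an assumed indexing helper, not part of the snippet
     if not self.does_line_existNew(line, x, y):
         self.add_line(line, x, y)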
Example #9
    def searchString(self, string):
        'searches for a string and returns an array of POS-tagged sentences'
        query = ""
        #print("Input String: ",string)
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            #print("Input String2: ",string)
            qp = QueryParser(Version.LUCENE_35, "sentence", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
            #print ("query",query)
                        
            hits = searcher.search(query, MAX)
            #print len(hits)
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                #print doc.get("sentence")
                sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string+" in search term")
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example #10
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Example #11
    def searchXYPair(self,x,y):
        """
        Returns all sentences, which are tagged with the given two entities (x,y)
        """
        tmp_hm = {}
        if x == "" or y == "":
            return []
        try:
            array = re.findall(r'[\w\s]+',x)
            x = ""
            for item in array:
                x+=item
            qp = QueryParser(Version.LUCENE_35, "X", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(x)
            MAX = 100000
            result_list = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                y_entry = doc["Y"]
                if y_entry == y:
                    tmp_hm[doc["Sentence"]]=""
                    
            for key in tmp_hm:
                result_list.append(IndexUtils.sentence_wrapper(key))
            tmp_hm = {}
            return result_list
        except:
            print("Fail (search XYPair) in x:"+x+" y:"+y)
            print "Unexpected error:", sys.exc_info()[0]
            print

            
        return []
Example #12
    def query(indexName, queryString):

        indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))
        qp.setDefaultOperator(qp.Operator.AND)
         
        query = qp.parse(queryString.replace("-","_"))
                
        aux = indSearcher.search(query, 100)
        results = aux.scoreDocs
        hits = aux.totalHits
        
        ir = indSearcher.getIndexReader()

        #results = collector.topDocs()
        res = []

        for r in results:
            # fetch each hit's stored document by its docID
            doc = ir.document(r.doc)
            res.append(doc.get('id'))

        return res
Example #13
    def searchKey(self, key, rank=None):
        query = ""
        try:
            MAX = 100000
            qp = QueryParser(Version.LUCENE_35, "key", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(key)
#             print ("query",query)
                        
            hits = searcher.search(query, MAX)

            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                try:
                    sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
                except:
                    print doc.get("sentence")
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+key)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example #14
    def testSlop(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"exact phrase"')
        self.assertEqual('"exact phrase"', q.toString("field"), "zero slop")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setPhraseSlop(5)
        q = qp.parse('"sloppy phrase"')
        self.assertEqual('"sloppy phrase"~5', q.toString("field"), "sloppy, implicitly")
Example #15
    def testLowercasing(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse("PrefixQuery*")
        self.assertEqual("prefixquery*", q.toString("field"), "lowercased")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setLowercaseExpandedTerms(False)
        q = qp.parse("PrefixQuery*")
        self.assertEqual("PrefixQuery*", q.toString("field"), "not lowercased")
Example #16
def extractFeatureQueryWords(query):
    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    try:
        # collect the known feature vocabulary, one or more words per line
        featurelist = []
        with open('../features.txt', 'r') as feature_file:
            for line in feature_file:
                featurelist += line.split()
             
        querywordlist = query.split()
        
        featureQueryList = []
        productQueryList = []
        
        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        
        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "
                
            print featureQuery
        
        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "
            
        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        # 'querywordlist' may not be bound if the failure came early
        return ("", "(" + query + ")", [], query.split())
Example #19
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    # create parser for user submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs
    
    return scoreDocs
Example #22
class IndexSearcherWrapper(object):
    def __init__(self, location):
        lucene.initVM()
        directory = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(directory, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text", WhitespaceAnalyzer())

    def search(self, topic, max=5000):
        query = self.query_parser.parse(topic.title)
        return self.searcher.search(query, max)
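A minimal usage sketch for the wrapper, assuming a hypothetical index location and a stand-in topic object with a title attribute (both assumptions):

class Topic(object):
    # stand-in for whatever topic object the caller passes in
    def __init__(self, title):
        self.title = title

wrapper = IndexSearcherWrapper("/path/to/index")  # hypothetical path
top_docs = wrapper.search(Topic("information retrieval"), max=10)
print "%d total hits" % top_docs.totalHits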
Example #23
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary containing key=value pairs where keys are fields in our
    lucene document and values are search terms provided by the user. A 
    BooleanQuery is built from these key=value pairs
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)        
    query_list = ["%s:\"%s\"" % (field, process_query_param(val)) 
                                   for (field, val) in 
                                        get_adv_query_packet(params)]

    return parser.parse("%s" % (" " + operator + " ").join(query_list))        
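A sketch of a call to the builder above; the field names here are hypothetical, and process_query_param / get_adv_query_packet are helpers from the surrounding project:

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
params = {"name": "smith", "city": "boston"}  # hypothetical fields
query = build_advanced_search_query(params, "AND", analyzer)
print query.toString("name")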
Example #25
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query (bind it to a new name instead of clobbering
        # the loop variable)
        parsed_query = queryParser.parse(query.query_text)

        # Run the query and get the top 50000 results
        topDocs = searcher.search(parsed_query, 50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Example #26
    def testWithSlop(self):

        searcher = IndexSearcher(self.directory, True)

        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             self.porterAnalyzer)
        parser.setPhraseSlop(1)

        query = parser.parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "hole accounted for")
Example #27
    def testDateRangeQuery(self):

        # locale diff between jre and gcj 1/1/04 -> 01/01/04
        # expression = "modified:[1/1/04 TO 12/31/04]"

        expression = "modified:[01/01/04 TO 12/31/04]"
        parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
        parser.setLocale(Locale.US)
        query = parser.parse(expression)
        print expression, "parsed to", query

        topDocs = self.searcher.search(query, 50)
        self.assert_(topDocs.totalHits > 0)
Example #30
File: searcher.py  Project: fay/wt
 def search(self, query,category_id=None):
     SHOULD = BooleanClause.Occur.SHOULD
     #MultiFieldQueryParser.setOperator(QueryParser.DEFAULT_OPERATOR_AND);
     parser1 = QueryParser('summary',self.analyzer)
     parser2 = QueryParser('title',self.analyzer)        
     parser1.setDefaultOperator(QueryParser.AND_OPERATOR)
     parser2.setDefaultOperator(QueryParser.AND_OPERATOR)
     q1 = parser1.parse(query)
     q2 = parser2.parse(query)
     boolQuery = BooleanQuery()
     boolQuery.add(q1,SHOULD)
     boolQuery.add(q2,SHOULD)
     
     #camp = CategoryComparatorSource(query)
     #sortfield = SortField("link", camp)
     #sort = Sort(sortfield)
     if category_id:
         self.catfilter.query = query
         self.catfilter.category_id = category_id
         hits = self.searcher.search(boolQuery,self.catfilter)
     else:
         hits = self.searcher.search(boolQuery)
     return hits
Example #31
File: plush.py  Project: jjguy/plush
 def search(self, command, field_id="contents", sort_on=None,
            sort_order=False, analyzer_id=None):
     """Do the lucene search."""
     analyzer = self.getAnalyzer(analyzer_id)
     try:
         if VERSION.startswith('1.9'):
             query = QueryParser.parse(command, field_id, analyzer)
         else:
             query = QueryParser(field_id, analyzer).parse(command)
     except JavaError:
         print "Error: Lucene cannot parse this query."
         return None
     if sort_on:
         return self.searcher.search(query, Sort(sort_on, sort_order))
     return self.searcher.search(query)
Example #32
    def query(indexName, queryFile, runName):
        indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
        indSearcher = IndexSearcher(indReader)
        ir = indSearcher.getIndexReader()

        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))

        f = open('results-'+runName, 'w')

        while True:
            id = queryFile.readline()

            if id == "":
                break

            id = id.replace("C","")
            id = id.replace("\n","")

            queryString = queryFile.readline()
            queryString = queryString.replace("?","")
            queryString = queryString.replace("*","")
            queryString = queryString.replace("-","_")
            queryString = queryString.replace("\n","")

            query = qp.parse(queryString)

            queryFile.readline()

            returnedDocs = 1000
            collector = TopScoreDocCollector.create(returnedDocs, True)

            indSearcher.search(query, collector)

            hits = collector.topDocs().scoreDocs

            size = len(hits)
            print "Total hits for query " +id+ ": "+str(size)

            for rank, hit in enumerate(hits, start=1):
                doc = ir.document(hit.doc)
                f.write(id + " 0 " + doc.get('id') + " " + str(rank) + " " +
                        str(hit.score) + " " + runName + "\n")

        f.close()
Example #33
def run(command):
    if command == '':
        return None
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
Example #34
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    This function runs a boolean query against the learned Lucene index.

    Arguments:
        index_dir - the Lucene index directory
        query_text - the query text, following
            http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to be retrieved
    Return:
        rows - the returned document details
    '''
    DEFAULT_QUERY_FIELD = 'all'
    
    
    store = SimpleFSDirectory(File(index_dir))
    
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)
    
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
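A hedged call example for the function above; the index path is a placeholder, and the query text follows the Lucene 3.6 syntax linked in the docstring:

rows = boolean_search_lucene_index("/tmp/lucene_index",  # placeholder path
                                   "apple AND (pie OR tart)", 10)
for row in rows:
    # the last two columns are the file id and the Lucene score (see above)
    print row[-2], row[-1]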
Example #35
    def someMethod(self):

        directory = RAMDirectory()

        analyzer = StandardAnalyzer()
        writer = IndexWriter(directory, analyzer, True)

        doc = Document()
        doc.add(Field.Text("title", "This is the title"))
        doc.add(Field.UnStored("contents", "...document contents..."))
        writer.addDocument(doc)

        writer.addDocument(doc, analyzer)

        expression = "some query"

        # the static parse() below is the older Lucene 1.x API
        query = QueryParser.parse(expression, "contents", analyzer)

        parser = QueryParser("contents", analyzer)
        query = parser.parse(expression)
Example #36
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example #39
    def testDateRangeQuery(self):

        # locale diff between jre and gcj 1/1/04 -> 01/01/04
        # expression = "modified:[1/1/04 TO 12/31/04]"
        
        #expression = "modified:[01/01/04 TO 31/12/04]"
        #parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
        #parser.setLocale(Locale("en_US.utf8"))
        #query = parser.parse(expression)
        #print expression, "parsed to", query
        
        #TODO: locale problem currently...
        expression = "modified:[04/01/01 TO 04/12/31]"
        parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
        parser.setLocale(Locale("en_US.utf8"))
        query = parser.parse(expression)
        print expression, "parsed to", query
        
        topDocs = self.searcher.search(query, 50)
        #self.assert_(topDocs.totalHits > 0) no docs?
        self.assert_(str(expression) != str(query))
Example #40
    def searchForDbpediaURImax(self, uri, number):
        """
        Returns at most `number` anchor texts related to the given DBpedia URI.
        Also returns, for each anchor text, the corresponding URI and how often
        the anchor appears on the English Wikipedia.
        """
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            MAX = 10000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
                if dbpedia_uri == uri_old:
                    result.append([doc["anchor"].encode("utf-8"), doc["anchor_uri"].encode("utf-8"), dbpedia_uri, int(doc["number"].encode("utf-8"))])
            result = sorted(result, key=itemgetter(3), reverse=True)
            if len(result) > number:
                return result[0:number]
            else:
                return result
        except:
            print("searchForDbpediaURImax - Fail in uri: "+uri)
            print "Unexpected error:", sys.exc_info()[0]
#            raise
            print
            return []
Example #41
def run(searcher, analyzer, input, filepath):
    #input = raw_input("Query:").decode('gbk').encode('utf8')
    command = convert(input.decode('gbk').encode('utf8'))
    print "Search for:" + command.decode('utf8').encode('gbk')
    qp = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)
    #qp.setPhraseSlop(0)
    query = qp.parse(command)
    scoreDocs = searcher.search(query, 1000000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    print

    try:
        #filepath = "D:\\TotalCode\\PyluceneSample\\Output_pylucene.txt"
        filew = open(filepath, 'w')
        result_num = 0
        for scoreDoc in scoreDocs:
            try:
                result_num += 1
                if result_num % 1000 == 0:
                    print "Search added " + str(result_num) + " sentences..."
                doc = searcher.doc(scoreDoc.doc)
                path = doc.get("path")
                #sentence = GetSentence(doc.get("sentence_num"), path)
                sentence = doc.get("sentence")
                OutputSentence(filew, doc.get("name"), sentence)
            except:
                continue
        filew.close()
    except: #Exception, e:
        print "Failed in Outputsentence:"#, e
Example #42
# Creates a searcher searching the provided index.
ireader = IndexReader.open(directory, True)
# Implements search over a single IndexReader.
# Use a single instance and use it across queries
# to improve performance.
searcher = IndexSearcher(ireader)
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Constructs a query parser. We specify what field to search into.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

appearance_dict = {}
for TERM in term_list:
    print 'Searching for: "' + TERM + '"'
    # Create the query
    query = queryParser.parse(TERM)

    # Run the query and get documents that contain the term
    docs_containing_term = searcher.search(query, ireader.numDocs())

    docs = []

    print 'Found ' + str(len(docs_containing_term.scoreDocs)) + \
          ' documents with the term "' + TERM + '".'
    #hits = searcher.search(query, 1)
    for hit in (docs_containing_term.scoreDocs):
        #print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        docs.append(doc.get(DOC_NAME))
    appearance_dict[TERM] = set(docs)
"""
Example #43
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
Example #44
    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    highlight_formatter = SimpleHTMLFormatter()
    query_score = QueryScorer(query)

    highlighter = Highlighter(highlight_formatter, query_score)

    # Set the fragment size. We break text into fragments of 64 characters.
    fragmenter = SimpleSpanFragmenter(query_score, 64)
    highlighter.setTextFragmenter(fragmenter)
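The snippet ends before any fragments are produced; a plausible continuation, assuming the Lucene 3.x contrib Highlighter API (getBestFragment re-analyzes the stored text and wraps matches in the formatter's default <B> tags), would be:

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text = doc.get(FIELD_CONTENTS)
        fragment = highlighter.getBestFragment(analyzer, FIELD_CONTENTS, text)
        print fragment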