def searchKey(self, key , rank = None):
        query = ""
        try:
            MAX = 100000
            qp = QueryParser(Version.LUCENE_35, "key", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(key)
#             print ("query",query)
                        
            hits = searcher.search(query, MAX)

            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                try:
                    sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
                except:
                    print doc.get("sentence")
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+key)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example no. 2
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Example no. 3
    def query(indexName, queryString):

        indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))
        qp.setDefaultOperator(qp.Operator.AND)
         
        query = qp.parse(queryString.replace("-","_"))
                
        aux = indSearcher.search(query, 100)
        results = aux.scoreDocs
        hits = aux.totalHits
        
        ir = indSearcher.getIndexReader()

        #results = collector.topDocs()
        i = 0

        res = []
    
        for r in results:
            # look up each hit by its Lucene document id, not the loop counter
            doc = ir.document(r.doc)
            res.insert(i, doc.get('id'))
            i += 1
            
        return res
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example no. 5
    def search(self, string ,special = None):
        query = ""
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            qp = QueryParser(Version.LUCENE_35, "title", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
#            print ("query",query)
                        
            hits = searcher.search(query, MAX)
            
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                sentence_list.append(doc.get("title").encode("utf-8"))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example no. 6
    def searchXYPair(self,x,y):
        """
        Returns all sentences, which are tagged with the given two entities (x,y)
        """
        tmp_hm = {}
        if x == "" or y == "":
            return []
        try:
            array = re.findall(r'[\w\s]+',x)
            x = ""
            for item in array:
                x+=item
            qp = QueryParser(Version.LUCENE_35, "X", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(x)
            MAX = 100000
            result_list = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                y_entry = doc["Y"]
                if y_entry == y:
                    tmp_hm[doc["Sentence"]]=""
                    
            for key in tmp_hm:
                result_list.append(IndexUtils.sentence_wrapper(key))
            tmp_hm = {}
            return result_list
        except:
            print("Fail (search XYPair) in x:"+x+" y:"+y)
            print "Unexpected error:", sys.exc_info()[0]
            print

            
        return []
    def searchForDbpediaURI(self, uri):
        """
        Returns all anchor texts, which are related to the given DBpedia URI.
        Also returns for each anchor text the corresponding URI and the number of how often the anchor appears on the english Wikipedia
        """
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            MAX = 10000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
                if dbpedia_uri == uri_old:
                    result.append([doc["anchor"].encode("utf-8"), doc["anchor_uri"].encode("utf-8"), dbpedia_uri, doc["number"].encode("utf-8")])
            return result
        except:
            print("searchForDbpediaURI - Fail in uri: "+uri)
            return []
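
Both helpers above strip Lucene operators out of the raw input by hand with re.findall before parsing. A minimal alternative sketch, assuming the same module-level searcher and analyzer globals these methods rely on, that uses QueryParser.escape (seen in later examples) instead of the manual cleanup:

def search_x_sentences(x, max_hits=100000):
    # hypothetical helper: escape the raw entity string instead of regex-stripping it
    qp = QueryParser(Version.LUCENE_35, "X", analyzer)
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parse(QueryParser.escape(x))
    hits = searcher.search(query, max_hits)
    # collect the tagged sentences stored in the "Sentence" field
    return [searcher.doc(hit.doc).get("Sentence") for hit in hits.scoreDocs]
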
Example no. 8
    def searchForDbpediaURI(self, uri):
        """
        Returns all sentences, which are tagged with the given DBpedia URI
        """
        print "in searchForDbpediaURI" 
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/ontology/","")
        uri = uri.replace("http://dbpedia.org/property/","")
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            print "query: "+str(query)
            MAX = 500000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["URI"]
                if dbpedia_uri == uri_old:
                    result.append([IndexUtils.sentence_wrapper(doc["Sentence"]), doc["X"], doc["Y"],dbpedia_uri])
            return result
        except:
            print("Fail in uri: "+uri)
            print "Unexpected error:", sys.exc_info()[0]
            return result
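
The lookup above runs an analyzed query and then filters the hits in Python against the untouched URI. A hedged sketch of pushing that exact-match constraint into the query itself, reusing BooleanQuery, TermQuery and Term as in the other examples; it assumes the URI field is also stored as a single untokenised term, which the snippet above does not confirm:

def searchForDbpediaURIExact(uri, max_hits=500000):
    # hypothetical variant: require the full URI as a MUST clause instead of post-filtering
    combined = BooleanQuery()
    combined.add(TermQuery(Term("URI", uri)), BooleanClause.Occur.MUST)
    hits = searcher.search(combined, max_hits)
    return [searcher.doc(hit.doc)["Sentence"] for hit in hits.scoreDocs]
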
Example no. 9
 def does_line_existNew(self,line,x,y):
     """
     Checks, if parsed sentence already exists in index
     """
     query = ""
     try:
         array = re.findall(r'[\w]+',line)
         string = ""
         for item in array:
             string+=item+" "
         qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
         qp.setDefaultOperator(qp.Operator.AND)
         query = qp.parse(string)
         
         MAX = 10
         hits = searcher.search(query, MAX)
         if len(hits.scoreDocs)>0:
             return True
         else:
             return False
     except Exception:
         s_tmp =  str(sys.exc_info())
         if "too many boolean clauses" in s_tmp:
             print "too many boolean clauses"
             """
             Returns true, so that the sentence is not added each time, to avoid further error messages.
             Only occours with very large sentences.
             """
             return True
         else:
             print "Unexpected error:", sys.exc_info()[0]
             print "in does line exist"
             print s_tmp
     return False
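
A short usage sketch of the duplicate check above, assuming it is called from the indexing loop of the same class; the writer variable and the Field options are assumptions about how the sentence index is built, not something the snippet shows:

     # hypothetical indexing guard built on the check above
     if not self.does_line_existNew(line, x, y):
         doc = Document()
         doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
         writer.addDocument(doc)
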
Example no. 10
    def searchString(self, string):
        'searches for a string and returns an array of POS-tagged sentences'
        query = ""
        #print("Input String: ",string)
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            #print("Input String2: ",string)
            qp = QueryParser(Version.LUCENE_35, "sentence", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
            #print ("query",query)
                        
            hits = searcher.search(query, MAX)
            #print len(hits)
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                #print doc.get("sentence")
                sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string+" in search term")
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
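
The eval call above implies that the "sentence" field stores the repr of a Python list of (token, tag) pairs. A minimal illustration of that round trip; the tagged sentence is made up:

# illustrative only: how a POS-tagged sentence could be serialised for this index
tagged = [("Gandhi", "NNP"), ("was", "VBD"), ("born", "VBN")]
stored = str(tagged)              # the string that would sit in the "sentence" field
assert eval(stored) == tagged     # searchString recovers the list with eval
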
Example no. 11
    def main(cls, argv):

        allBooks = MatchAllDocsQuery()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        query = BooleanQuery()
        query.add(allBooks, BooleanClause.Occur.SHOULD)
        query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        example = SortingExample(directory)

        example.displayResults(query, Sort.RELEVANCE)
        example.displayResults(query, Sort.INDEXORDER)
        example.displayResults(query,
                               Sort(SortField("category", SortField.STRING)))
        example.displayResults(query,
                               Sort(SortField("pubmonth", SortField.INT, True)))

        example.displayResults(query,
                               Sort([SortField("category", SortField.STRING),
                                     SortField.FIELD_SCORE,
                                     SortField("pubmonth", SortField.INT, True)]))

        example.displayResults(query,
                               Sort([SortField.FIELD_SCORE,
                                     SortField("category", SortField.STRING)]))
        directory.close()
Example no. 12
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field", analyzer).parse('"This is Some Phrase*"')
        self.assertEqual('"some phrase"', q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
Example no. 13
    def main(cls):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''
Example no. 14
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term
    
    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    try:
        file = open('../features.txt', 'r')
        
        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line
             
        querywordlist = query.split()
        
        featureQueryList = []
        productQueryList = []
        
        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        
        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "
                
            print featureQuery
        
        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "
            
        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        return ("", "(" + query + ")", [], querywordlist)
Example no. 15
    def __init__(self, emoticon, searcher, analyzer, english_only=False):
        super(PMICalculator, self).__init__()

        self.field = "emoticons"
        self.emoticon = emoticon
        self.searcher = searcher
        self.analyzer = analyzer
        self.escaped_emoticon = QueryParser.escape(self.emoticon)
        self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
        self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
        if english_only:
            country = "United States"
            country_prefix = "US"
        else:
            country = None
            country_prefix = ""
        self.pmi_file_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".pmidata"
        )
        self.sample_tweets_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".samptweets"
        )
        self.sample_tweets_file = codecs.open(self.sample_tweets_name, encoding="utf-8", mode="w")
        self.term_count_collector = TermCountCollector(searcher, emoticon, country)
        print "starting query at: ", time.time()
        hits = self.searcher.search(self.query, self.term_count_collector)
        # print "terms: ", self.terms
        if emoticon == ":P":
            ee_two = QueryParser.escape(":p")
        elif emoticon == "T_T":
            ee_two = QueryParser.escape("TT")
        elif emoticon == "^_^":
            ee_two = QueryParser.escape("^^")
        if emoticon in [":P", "T_T", "^_^"]:
            q_two = QueryParser("emoticons", self.analyzer).parse(ee_two)
            hits_two = self.searcher.search(q_two, self.term_count_collector)
        self.terms = self.term_count_collector.getTerms()
        self.query_result_count = self.term_count_collector.getDocCount()
        for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items():
            for p_term_tweet in p_term_tweets:
                self.sample_tweets_file.write("term: " + p_term + " tweet: " + p_term_tweet + "\n")
        self.sample_tweets_file.close()
        self.base_stats_file = open(
            "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_pmi_stats.txt", "r"
        )
        self.n = int(self.base_stats_file.read().strip().split(":")[1])

        print "computing PMI for query: ", self.emoticon, " at: ", time.time()

        self.p_query_result = self.query_result_count * 1.0 / self.n
Example no. 16
    def testBasicQueryParser(self):

        analyzer = SimpleAnalyzer()
        query = QueryParser(Version.LUCENE_CURRENT, "description",
                            analyzer).parse("partnum:Q36 AND SPACE")

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        self.assertEqual("+partnum:q +space", query.toString("description"),
                         "note Q36 -> q")
        self.assertEqual(0, len(scoreDocs), "doc not found :(")
Example no. 17
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field",
                        analyzer).parse('"This is Some Phrase*"')
        self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"term"')
        self.assert_("TermQuery" == q.getClassName(), "reduced to TermQuery")
Example no. 18
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    # create parser for user submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs
    
    return scoreDocs
    def testWithSlop(self):

        searcher = IndexSearcher(self.directory, True)

        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             self.porterAnalyzer)
        parser.setPhraseSlop(1)

        query = parser.parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "hole accounted for")
Example no. 20
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary containing key=value pairs where keys are fields in our
    lucene document and values are search terms provided by the user. A 
    BooleanQuery is built from these key=value pairs
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)        
    query_list = ["%s:\"%s\"" % (field, process_query_param(val)) 
                                   for (field, val) in 
                                        get_adv_query_packet(params)]

    return parser.parse("%s" % (" " + operator + " ").join(query_list))        
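
A hedged usage sketch for the builder above, assuming get_adv_query_packet simply yields the (field, value) pairs of the params dict and that the index exposes name and author fields (both field names are illustrative, and analyzer/searcher are the usual module-level objects):

# hypothetical call: AND together two fielded, escaped phrase terms
params = {"name": "lucene in action", "author": "gospodnetic"}
query = build_advanced_search_query(params, "AND", analyzer)
scoreDocs = searcher.search(query, 50).scoreDocs
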
Example no. 21
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get top 50 results
        topDocs = searcher.search(query,50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Example no. 22
    def testDateRangeQuery(self):

        # locale diff between jre and gcj 1/1/04 -> 01/01/04
        # expression = "modified:[1/1/04 TO 12/31/04]"

        expression = "modified:[01/01/04 TO 12/31/04]"
        parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
        parser.setLocale(Locale.US)
        query = parser.parse(expression)
        print expression, "parsed to", query

        topDocs = self.searcher.search(query, 50)
        self.assert_(topDocs.totalHits > 0)
Example no. 23
    def testTermRangeQuery(self):

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:[K TO N]")
        self.assert_(query.getClassName() == "TermRangeQuery")

        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms")

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:{K TO Mindstorms}")
        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms",
                                    True)
Example no. 24
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        parsed_command = QueryParser.escape(command)
        query = QueryParser("text", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()

        try:
            hctr = 0
            for hit in hits:
                hit_id = hits.id(hctr), 
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms(): trm_str += " " + trm
                print "term string: ", trm_str.encode("ascii","ignore")
                hctr += 1
                if hctr > hits.length()-2 or hctr > 100: break
                print 'uid:', hit.get("user_id"), 'timestamp: ', hit.get("timestamp"), "country: ", hit.get('country'), "emoticons: ", hit.get('emoticons')
        except Exception, e: 
            print "failed to list hit: ", e

        print
        command = raw_input("Query:")
        parsed_command = QueryParser.escape(command)
        print "Searching for emoticon:", parsed_command
        query = QueryParser("emoticons", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()

        try:
            hctr = 0
            for hit in hits:
                hit_id = hits.id(hctr), 
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms(): trm_str += " " + trm
                print "term string: ", trm_str.encode("ascii","ignore")
                hctr += 1
                if hctr > hits.length()-2 or hctr > 100: break
                print 'uid:', hit.get("user_id"), 'timestamp: ', hit.get("timestamp"), "country: ", hit.get('country'), "emoticons: ", hit.get('emoticons')
        except Exception, e: 
            print "failed to list hit: ", e
Example no. 25
    def query(indexName, queryFile, runName):
        indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
        indSearcher = IndexSearcher(indReader)
        ir = indSearcher.getIndexReader()

        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))

        f = open('results-'+runName, 'w')

        while(True):
            id = queryFile.readline()

            if id == "":
                break

            id = id.replace("C","")
            id = id.replace("\n","")

            queryString = queryFile.readline()
            queryString = queryString.replace("?","")
            queryString = queryString.replace("*","")
            queryString = queryString.replace("-","_")
            queryString = queryString.replace("\n","")

            query = qp.parse(queryString)

            queryFile.readline()

            returnedDocs = 1000
            collector = TopScoreDocCollector.create(returnedDocs, True)

            indSearcher.search(query, collector)

            hits = collector.topDocs().scoreDocs

            size = len(hits)
            print "Total hits for query " +id+ ": "+str(size)

            i = 0
            for hit in hits:        
                docId = hits[i].doc
                score = hits[i].score
                doc = ir.document(docId)
                j = i + 1
                f.write(id + " 0 " + doc.get('id') + " " + str(j) + " " + str(score) +" " + runName +"\n")
                i+=1

        f.close()
def calculateEmoticonDiffusion(emoticon, searcher, analyzer, user_location_hash, usage_threshold = 1, comm_threshold = 1):
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    emoticon_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_diffusion_stats.txt","r") 
    total_users = int(emoticon_stats_file.read().strip())
    emoticon_stats_file.close()

    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_')+".diffusion_bidir"
    print "Calculating Diffusion for: ", emoticon, " at: ", time.time()
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    if hits.length() == 0: return

    print "compiling diffusion stats at: ", time.time()
    emoticon_users_by_time_hash = {}
    emoticon_users_adopters_hash = {}
    emoticon_users_non_adopters_hash = {}
    users_exposure_hash = {}
    reverse_users_exposure_hash = {}
    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr%100000==0: print "on hit: ", hctr
            #if hctr > 100000: break
            if hctr == hits.length(): break
            uid, timestamp, country, emoticons, user_id_replied = hit.get("user_id"), int(hit.get("timestamp")), hit.get('country'), hit.get('emoticons'), hit.get('user_id_replied')
            emoticon_users_by_time_hash[uid] = emoticon_users_by_time_hash.get(uid,[])+[timestamp]
    except Exception, e:
        pass
Example no. 27
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    This function searches a boolean query in the learned lucene index 
    
    Arguments: 
        index_dir - the lucene index directory 
        query_text - the query text which follows http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to be retrieved 
    Return: 
        rows - the returned document details 

    
    '''
    DEFAULT_QUERY_FIELD = 'all'
    
    
    store = SimpleFSDirectory(File(index_dir))
    
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)
    
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
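
A minimal usage sketch for the function above; the index path and query text are illustrative, and the query string follows the Lucene syntax the docstring points to:

# hypothetical call: fetch up to 20 records whose default 'all' field matches both terms
rows = boolean_search_lucene_index('/path/to/lucene/index', 'apache AND lucene', 20)
for row in rows:
    print row[-1]   # the hit score is appended as the last column above
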
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", analyzer).parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example no. 29
    def testLowercasing(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse("PrefixQuery*")
        self.assertEqual("prefixquery*", q.toString("field"), "lowercased")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setLowercaseExpandedTerms(False)
        q = qp.parse("PrefixQuery*")
        self.assertEqual("PrefixQuery*", q.toString("field"), "not lowercased")
Example no. 30
    def testSlop(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"exact phrase"')
        self.assertEqual('"exact phrase"', q.toString("field"), "zero slop")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setPhraseSlop(5)
        q = qp.parse('"sloppy phrase"')
        self.assertEqual('"sloppy phrase"~5', q.toString("field"), "sloppy, implicitly")
def run(searcher, analyzer, command):
    # print "Searching for:", command
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    rankedfiles = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # print 'path:', doc.get("path"), 'name:', doc.get("name")
        rankedfiles.append(int(doc.get("name")))
    return rankedfiles
Example no. 32
class IndexSearcherWrapper(object):
    def __init__(self, location):
        lucene.initVM()
        directory = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(directory, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                        WhitespaceAnalyzer())

    def search(self, topic, max=5000):
        query = self.query_parser.parse(topic.title)
        return self.searcher.search(query, max)
Example no. 33
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example no. 34
    def LatestSearch(self):
        try:
            searcher = IndexSearcher(self.indexDir)
            today = time.strftime('%Y%m%d')
            keyWord = today.encode('utf8')
            print keyWord
            query = QueryParser(Version.LUCENE_30, "regDate",
                                self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except:
            print 'BookSearcher LatestSearch Exception'
Example no. 35
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream("contents", StringReader(text))  # tokenStream expects the field name, not the field value
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Example no. 36
def run(searcher, analyzer, input, filepath):
    #input = raw_input("Query:").decode('gbk').encode('utf8')
    #print "Search for: " + input
    command = convert(input.decode('gbk').encode('utf8'))
    print "Search for:" + command.decode('utf8').encode('gbk')
    qp = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)
    #qp.setPhraseSlop(0)
    query = qp.parse(command)
    scoreDocs = searcher.search(query, 1000000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    print

    try:
        #filepath = "D:\\TotalCode\\PyluceneSample\\Output_pylucene.txt"
        filew = open(filepath, 'w')
        result_num = 0
        for scoreDoc in scoreDocs:
            try:
                result_num += 1
                if result_num % 1000 == 0:
                    #time.sleep(5)
                    print "Search added " + str(result_num) + " sentences..."
                #print 'scoreDoc.doc:', scoreDoc.doc
                doc = searcher.doc(scoreDoc.doc)
                path = doc.get("path")
                #print "path:" + path
                #print 'name:', doc.get("name")
                #print 'sentence_num:', str(doc.get("sentence_num"))
                #print 'sentence:', doc.get("sentence")
                #sentence = GetSentence(doc.get("sentence_num"), path)
                sentence = doc.get("sentence")
                #print 'sentence:', sentence
                OutputSentence(filew, doc.get("name"), sentence)
            except:
                continue
        filew.close()
    except: #Exception, e:
        print "Failed in Outputsentence:"#, e
Example no. 37
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example no. 38
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example no. 39
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example no. 40
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example no. 41
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30,"<default field>",\
        lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            print "Hit Score:", "%.4f" % hit.score, "Department:", lucene_searcher.doc(
                hit.doc).get("department").encode(
                    "utf-8"), "Title:", lucene_searcher.doc(
                        hit.doc).get("title").encode("utf-8")
            print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
Example no. 42
def run(searcher, analyzer, querystr):
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(querystr)
    MAX = 1000
    hits = searcher.search(query, MAX)

    results = []

    for sd in hits.scoreDocs:
        doc = searcher.doc(sd.doc)
        results.append([
            doc.get("name"),
            doc.get("owner").encode('gbk'),
            doc.get("title").encode('gbk')
        ])

    # sort result
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title
Example no. 43
def run(searcher, analyzer, command, prior):
    if command == '':
        return

    store = []

    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs

    scored = []

    for scoreDoc in scoreDocs:
        returnfile = []
        doc = searcher.doc(scoreDoc.doc)

        if doc.get("qst_num") in scored:
            continue
        if not doc.get("qst_name"):
            continue
        scored.append(doc.get("qst_num"))

        name = doc.get("qst_name").replace(' ', '')
        returnfile.append(name)
        detail = doc.get("qst_detail").replace(' ', '')
        returnfile.append(detail)

        returnfile.append(doc.get("qst_topic_accu"))
        returnfile.append(int(doc.get("qst_browse")))
        returnfile.append(int(doc.get("qst_follow")))
        returnfile.append(int(doc.get("qst_ans")))
        returnfile.append(int(doc.get("qst_num")))

        store.append(returnfile)

    store = storesort(store, prior)
    return store
Example no. 44
def luceneRetriver(query):
    #print ('-------------Searching-------------')
    #print (query)
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, 'text',
                           lucene_analyzer).parse(query)
    MAX = 1000

    # list holding the titles of the returned documents
    title_list = []

    total_hits = lucene_searcher.search(my_query, MAX)

    #print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs[:10]:

        #print"Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()

        doc = lucene_searcher.doc(hit.doc)

        #print doc.get("title").encode("utf-8").lstrip(str(TXTDIR))
        #print doc.get("text").encode("utf-8")
        #print ('\n')

        title_list.append({
            doc.get("title").encode("utf-8").lstrip(str(TXTDIR)):
            round(hit.score, 5)
        }.copy())

    return title_list


#print ('Query terms: 八卦')
#print ('Results:')
#print ('\n')
#luceneRetriver("下列 关于 中国 八卦 不正确 人类 历史 东西方 平等 交流 见证")
Example no. 45
    def find(self, phrase):
        phrase = phrase.lower().encode('utf8')
        query = ' '.join(['+' + word for word in phrase.split(' ')])
        query = QueryParser(self._lversion, 'contents',
                            self._analyzer).parse(query)
        hits = self._searcher.search(query, self.max_candidates)

        # if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query)) # potential bug

        # todo put article_id in lucene index instead of translating document title

        links = {}
        for hit in hits.scoreDocs:
            title = quote(
                self._searcher.doc(
                    hit.doc).get("title").encode('utf-8').replace(
                        ' ', '_')).replace('%28', '(').replace('%29', ')')
            if title in self._translation:
                links[self._translation[title]] = hit.score
            # else: print title # potential bug

        return self._links[phrase].get(-1, 0), links
Example no. 46
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        command_dict = parseCommand(command)
        sep_command = " ".join(jieba.cut(command_dict['contents']))
        command_dict['contents'] = sep_command
        #print command_dict
        if not command_dict.has_key('site'):
            command = command_dict['contents']
        else:
            command = command_dict['contents'] + " site:" + command_dict['site']
        print
        print "Searching for:", command

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            #print 'site:', doc.get("site")
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
Example no. 47
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: Explainer <index dir> <query>"

        else:
            indexDir = argv[1]
            queryExpression = argv[2]

            directory = SimpleFSDirectory(File(indexDir))
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                SimpleAnalyzer()).parse(queryExpression)

            print "Query:", queryExpression

            searcher = IndexSearcher(directory)
            scoreDocs = searcher.search(query, 50).scoreDocs

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                explanation = searcher.explain(query, scoreDoc.doc)
                print "----------"
                print doc["title"].encode('utf-8')
                print explanation
Example no. 48
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:") # read the query string from the console
        command = unicode(command, 'GBK')
                        # convert the query to Unicode (the index was also built from Unicode text);
                        # the console encoding is GBK under Python IDLE and UTF-8 under PyScripter-Portable (see Q.ppt)
        if command == '':
            return

        print
        print "Searching for:", command 
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
                        # the analyzer tokenises and normalises the query string;
                        # QueryParser then parses it into the query syntax tree held in the Query object
        scoreDocs = searcher.search(query, 50).scoreDocs
                        # IndexSearcher runs search on the Query tree and returns the matching documents
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example no. 49
def run(searcher, analyzer, command):
    while True:
        if command == '':
            return
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 300).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        text = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            temptext = [
                doc.get("url"),
                doc.get('title'),
                doc.get("imgurl"),
                doc.get("price"),
                doc.get("kind")
            ]
            text.append(temptext)
        return text
Example no. 50
        writer.addDocument(doc)

    print("Indexed lines from stdin (%d documents in index)" %
          (writer.numDocs()))
    print("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()
    print("...done optimizing index of %d documents" % writer.numDocs())
    print("Closing index of %d documents..." % writer.numDocs())
    print("...done closing index of %d documents" % writer.numDocs())
    writer.close()

    # RETRIEVAL

    dir = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(u"¿Dónde está La Mancha?")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))

    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        print(doc.get("text").encode("utf-8"))
        print(doc.get("metadata").encode("utf-8"))
Example no. 51
		f.write(string)
		f.write(item)

config = ConfigParser()
config.read('config.ini')
r_server = Redis('localhost')
lst = []
search = str(sys.argv[1])
if __name__ == "__main__":
    lucene.initVM()
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    for hit in hits.scoreDocs:
        if hit.score >= 0.0:
            #print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            #print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(config.get('counting','delimiter'))
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
Example no. 52
    def post(self):
        q = self.get_argument("query")
        k = self.get_argument("kTerms")

        # self.write(key)

        # def query(query):
        # query = self.get_argument("q")
        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (
            hits.totalHits, query)
        items = []
        rQ = []

        #for key, value in doc_urls.iteritems()
        # print (key, value)

        for hit in hits.scoreDocs:
            #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
            print hit.score, hit.doc, hit.toString()
            print(len(doc_urls))
            items.append(doc_urls[str(hit.doc)])
            print(doc_urls[str(hit.doc)])
            doc = searcher.doc(hit.doc)
            print(hit.doc)
            rQ.append("html_files/" + str(hit.doc))

        i = 0
        rqSize = 0
        for url in rQ:
            rqSize = rqSize + 1
            print(url)
            f = codecs.open(url, 'r')
            html = f.read()
            html = html.decode('utf-8')
            tag_free = strip_tags(html)
            path = 'strippedHTML_files'
            if not os.path.exists(path):
                os.makedirs(path)
            filename = str(i)
            with open(os.path.join(path, filename), 'wb') as temp_file:
                temp_file.write(tag_free.encode('utf-8'))
            i = i + 1

        path = 'strippedHTML_files'
        i = 0
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r') as myfile:
                data = myfile.read()
                stripStopWords(data, i)
                i = i + 1
        if int(k) > 0:  # kTerms arrives as a string from get_argument
            newQuery = calcNewQuery(k, q, rqSize)
            q = newQuery
            print("new query is ")
            print(q)

        self.render("index.html",
                    title="Results",
                    items=items,
                    query=q,
                    kTerms=k)
Example no. 53
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
Example no. 54
if __name__ == "__main__":
    usedsentences = numpy.zeros((BLOOM_FILTER_SIZE, ), dtype=numpy.bool)
    print >> sys.stderr, "Just created bloom filter with %d entries" % usedsentences.shape[
        0]
    print >> sys.stderr, stats()

    lucene.initVM()
    # create an index called 'index-dir' in a temp directory
    #    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
    #                            'index-dir')
    #    indexDir = "/Tmp/REMOVEME.index-dir"
    indexDir = "lucene.ukwac"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
    searcher = IndexSearcher(dir)

    nonzeros = 0

    for i, l in enumerate(sys.stdin):
        if i % 100 == 0:
            print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (
                i, percent(nonzeros, BLOOM_FILTER_SIZE))
            print >> sys.stderr, stats()
        l = string.strip(l)

        added_this_sentence = 0
        for newl in retrieve(l, searcher, queryparser):
            # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
            if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
    def searchDocuments(self, view, version, query=None, attribute=None):

        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)

        if attribute:
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        class _collector(PythonHitCollector):
            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits = []

            def collect(_self, id, score):

                _self.hits.append((-score, id))

        class _iterator(object):
            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    heapify(hits)
                    while hits:
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])

                        if long(doc['version']) <= version:
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()
Example no. 56
    def testBoost(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse("term^2")
        self.assertEqual("term^2.0", q.toString("field"))
Example no. 57
    'robespierre', 'danton', 'xvi'
]  #, 'marat', 'mirabeau', 'antoinette', 'fayette', 'tyran']#, 'égalité'.decode('utf-8'), 'fraternité'.decode('utf-8'), 'révolution'.decode('utf-8'), 'salut', 'necker', 'napoleon', 'monarchie', 'aristocratie', 'hébert'.decode('utf-8'), 'gironde', 'jacobins', 'feuillants', 'royalistes','royaliste', 'guillotine', 'bastille', 'versailles', 'tuilleries', 'paume', 'constitution', 'etats', 'citoyen', 'democratie']

initVM()
# Get handle to index directory
directory = SimpleFSDirectory(File(STORE_DIR))
# Creates a searcher searching the provided index.
ireader = IndexReader.open(directory, True)
# Implements search over a single IndexReader.
# Use a single instance and use it across queries
# to improve performance.
searcher = IndexSearcher(ireader)
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Constructs a query parser. We specify what field to search into.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

appearance_dict = {}
for TERM in term_list:
    print 'Searching for: "' + TERM + '"'
    # Create the query
    query = queryParser.parse(TERM)

    # Run the query and get documents that contain the term
    docs_containing_term = searcher.search(query, ireader.numDocs())

    docs = []

    print 'Found ' + str(len(docs_containing_term.scoreDocs)
                         ) + ' documents with the term "' + TERM + '".'
    #hits = searcher.search(query, 1)
Example no. 58
    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    HighlightFormatter = SimpleHTMLFormatter()
    query_score = QueryScorer (query)

    highlighter = Highlighter(HighlightFormatter, query_score)
Example no. 59
def process_query_param(param):
    """
    Escapes and lowercases all query params for searching in the lucene index.
    """
    processed_param = QueryParser.escape(param)
    return processed_param.lower()
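
A small illustration of the helper above; the input is made up, and the output shown assumes the standard QueryParser.escape behaviour of backslash-escaping query syntax characters such as the colon and parentheses:

# hypothetical input containing Lucene special characters
print process_query_param('Title:(Foo)')   # -> title\:\(foo\)
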
Example no. 60
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    # Search documents
    onlyfiles = [
        f for f in listdir(inputDir)
        if isfile(join(inputDir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        json_data = open(inputDir + '/' + f)
        data = json.load(json_data)
        # The results collected after comparison
        results = list()

        journal_code = f.split('.')[0]

        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" %
                         (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            entry['similars'] = list()

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title",
                                    analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" %
                             (hits.totalHits, q))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    entry['similars'].append({
                        'token': token,
                        'url': doc.get('url'),
                        'title': doc.get('title')
                    })

            results.append(entry)
        json_data.close()

        print """<html>
	<body>
	<table><thead>
	<tr>
	<th>Jornal</th><th>Data</th><th>T&iacute;tulo</th><th>URL</th><th>Not&iacute;cias semelhantes</th>
	</tr>
	</thead>
	<tbody>
	"""
        for entry in results:
            similars = entry['similars']
            similars_text = '<ul>'
            for s in similars:
                similars_text += '<li>[%s] [%s] [%s]</li>' % (
                    s['token'].encode('iso-8859-1', 'ignore'),
                    s['title'].encode('iso-8859-1', 'ignore'), s['url'].encode(
                        'iso-8859-1', 'ignore'))
            similars_text += '</ul>'
            print """<tr>
	<td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>
	</tr>
	""" % (journal_code, entry['date'].encode('iso-8859-1', 'ignore'),
            entry['title'].encode('iso-8859-1', 'ignore'), entry['url'].encode(
            'iso-8859-1', 'ignore'), similars_text)