    def searchKey(self, key, rank=None):
        query = ""
        try:
            MAX = 100000
            qp = QueryParser(Version.LUCENE_35, "key", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(key)
#             print ("query",query)
                        
            hits = searcher.search(query, MAX)

            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                try:
                    sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
                except:
                    print doc.get("sentence")
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+key)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
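The eval call above executes whatever text was stored in the "sentence" field, which is fragile and unsafe for untrusted index contents. A minimal safer sketch, assuming the field holds a plain Python literal (for example a list of token/tag tuples); parse_stored_sentence is a hypothetical helper, not part of this codebase:

import ast

def parse_stored_sentence(raw):
    # Unlike eval, ast.literal_eval accepts only Python literals
    # (strings, numbers, tuples, lists, dicts), so a malformed or
    # malicious "sentence" value cannot execute code.
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return None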
Example #2
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Example #3
    def query(indexName, queryString):

        indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))
        qp.setDefaultOperator(qp.Operator.AND)
         
        query = qp.parse(queryString.replace("-","_"))
                
        aux = indSearcher.search(query, 100)
        results = aux.scoreDocs
        hits = aux.totalHits
        
        ir = indSearcher.getIndexReader()

        res = []
        for r in results:
            # fetch each matching document by its hit id, not by a running counter
            doc = ir.document(r.doc)
            res.append(doc.get('id'))

        return res
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example #5
    def search(self, string, special=None):
        query = ""
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            qp = QueryParser(Version.LUCENE_35, "title", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
#            print ("query",query)
                        
            hits = searcher.search(query, MAX)
            
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                sentence_list.append(doc.get("title").encode("utf-8"))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example #6
    def searchXYPair(self, x, y):
        """
        Returns all sentences that are tagged with the given pair of entities (x, y).
        """
        tmp_hm = {}
        if x == "" or y == "":
            return []
        try:
            array = re.findall(r'[\w\s]+',x)
            x = ""
            for item in array:
                x+=item
            qp = QueryParser(Version.LUCENE_35, "X", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(x)
            MAX = 100000
            result_list = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                y_entry = doc["Y"]
                if y_entry == y:
                    tmp_hm[doc["Sentence"]]=""
                    
            for key in tmp_hm:
                result_list.append(IndexUtils.sentence_wrapper(key))
            tmp_hm = {}
            return result_list
        except:
            print("Fail (search XYPair) in x:"+x+" y:"+y)
            print "Unexpected error:", sys.exc_info()[0]
            print

            
        return []
    def searchForDbpediaURI(self, uri):
        """
        Returns all anchor texts related to the given DBpedia URI, together with
        each anchor's URI and the number of times the anchor appears on the
        English Wikipedia.
        """
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            MAX = 10000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
                if dbpedia_uri == uri_old:
                    result.append([doc["anchor"].encode("utf-8"), doc["anchor_uri"].encode("utf-8"), dbpedia_uri, doc["number"].encode("utf-8")])
            return result
        except:
            print("searchForDbpediaURI - Fail in uri: "+uri)
            return []
Example #8
    def searchForDbpediaURI(self, uri):
        """
        Returns all sentences that are tagged with the given DBpedia URI.
        """
        print "in searchForDbpediaURI" 
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/ontology/","")
        uri = uri.replace("http://dbpedia.org/property/","")
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            print "query: "+str(query)
            MAX = 500000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["URI"]
                if dbpedia_uri == uri_old:
                    result.append([IndexUtils.sentence_wrapper(doc["Sentence"]), doc["X"], doc["Y"],dbpedia_uri])
            return result
        except:
            print("Fail in uri: "+uri)
            print "Unexpected error:", sys.exc_info()[0]
            # `result` may be unbound if parsing failed, so return a fresh list
            return []
Example #9
    def does_line_existNew(self, line, x, y):
        """
        Checks if the parsed sentence already exists in the index.
        """
        query = ""
        try:
            array = re.findall(r'[\w]+', line)
            string = ""
            for item in array:
                string += item + " "
            qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)

            MAX = 10
            hits = searcher.search(query, MAX)
            if len(hits.scoreDocs) > 0:
                return True
            else:
                return False
        except Exception:
            s_tmp = str(sys.exc_info())
            if "too many boolean clauses" in s_tmp:
                print "too many boolean clauses"
                # Return True so the sentence is not re-added each time,
                # avoiding repeated error messages. This only occurs with
                # very long sentences.
                return True
            else:
                print "Unexpected error:", sys.exc_info()[0]
                print "in does line exist"
                print s_tmp
        return False
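The "too many boolean clauses" branch above fires when a parsed query expands past Lucene's default limit of 1024 boolean clauses. Rather than treating such sentences as already indexed, the limit can be raised once at startup; a sketch assuming PyLucene 3.x, where BooleanQuery exposes the static setter:

from lucene import BooleanQuery

# Allow larger expanded queries; this trades memory and CPU for
# being able to check very long sentences against the index.
BooleanQuery.setMaxClauseCount(4096)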
    def searchString(self, string):
        'searches for a string and returns an array of POS-tagged sentences'
        query = ""
        #print("Input String: ",string)
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            #print("Input String2: ",string)
            qp = QueryParser(Version.LUCENE_35, "sentence", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
            #print ("query",query)
                        
            hits = searcher.search(query, MAX)
            #print len(hits)
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                #print doc.get("sentence")
                sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string+" in search term")
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example #11
    def main(cls, argv):

        allBooks = MatchAllDocsQuery()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        query = BooleanQuery()
        query.add(allBooks, BooleanClause.Occur.SHOULD)
        query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        example = SortingExample(directory)

        example.displayResults(query, Sort.RELEVANCE)
        example.displayResults(query, Sort.INDEXORDER)
        example.displayResults(query,
                               Sort(SortField("category", SortField.STRING)))
        example.displayResults(query,
                               Sort(SortField("pubmonth", SortField.INT, True)))

        example.displayResults(query,
                               Sort([SortField("category", SortField.STRING),
                                     SortField.FIELD_SCORE,
                                     SortField("pubmonth", SortField.INT, True)]))

        example.displayResults(query,
                               Sort([SortField.FIELD_SCORE,
                                     SortField("category", SortField.STRING)]))
        directory.close()
Example #12
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field", analyzer).parse('"This is Some Phrase*"')
        self.assertEqual('"some phrase"', q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
Example #13
    def main(cls):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''
Example #14
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term
    
    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    try:
        file = open('../features.txt', 'r')
        
        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line
             
        querywordlist = query.split()
        
        featureQueryList = []
        productQueryList = []
        
        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        
        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "
                
            print featureQuery
        
        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "
            
        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        return ("", "(" + query + ")", [], querywordlist)
Example #15
    def __init__(self, emoticon, searcher, analyzer, english_only=False):
        super(PMICalculator, self).__init__()

        self.field = "emoticons"
        self.emoticon = emoticon
        self.searcher = searcher
        self.analyzer = analyzer
        self.escaped_emoticon = QueryParser.escape(self.emoticon)
        self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
        self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
        if english_only:
            country = "United States"
            country_prefix = "US"
        else:
            country = None
            country_prefix = ""
        self.pmi_file_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".pmidata"
        )
        self.sample_tweets_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".samptweets"
        )
        self.sample_tweets_file = codecs.open(self.sample_tweets_name, encoding="utf-8", mode="w")
        self.term_count_collector = TermCountCollector(searcher, emoticon, country)
        print "starting query at: ", time.time()
        hits = self.searcher.search(self.query, self.term_count_collector)
        # print "terms: ", self.terms
        if emoticon == ":P":
            ee_two = QueryParser.escape(":p")
        elif emoticon == "T_T":
            ee_two = QueryParser.escape("TT")
        elif emoticon == "^_^":
            ee_two = QueryParser.escape("^^")
        if emoticon in [":P", "T_T", "^_^"]:
            q_two = QueryParser("emoticons", self.analyzer).parse(ee_two)
            hits_two = self.searcher.search(q_two, self.term_count_collector)
        self.terms = self.term_count_collector.getTerms()
        self.query_result_count = self.term_count_collector.getDocCount()
        for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items():
            for p_term_tweet in p_term_tweets:
                self.sample_tweets_file.write("term: " + p_term + " tweet: " + p_term_tweet + "\n")
        self.sample_tweets_file.close()
        self.base_stats_file = open(
            "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_pmi_stats.txt", "r"
        )
        self.n = int(self.base_stats_file.read().strip().split(":")[1])

        print "computing PMI for query: ", self.emoticon, " at: ", time.time()

        self.p_query_result = self.query_result_count * 1.0 / self.n
Example #16
    def testBasicQueryParser(self):

        analyzer = SimpleAnalyzer()
        query = QueryParser(Version.LUCENE_CURRENT, "description",
                            analyzer).parse("partnum:Q36 AND SPACE")

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        self.assertEqual("+partnum:q +space", query.toString("description"),
                         "note Q36 -> q")
        self.assertEqual(0, len(scoreDocs), "doc not found :(")
Example #17
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field",
                        analyzer).parse('"This is Some Phrase*"')
        self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"term"')
        self.assert_("TermQuery" == q.getClassName(), "reduced to TermQuery")
Example #18
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    # create parser for user submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs
    
    return scoreDocs
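A usage sketch for the helper above, relying on the module-level searcher these snippets assume and a hypothetical index with a "title" field:

for sd in getResultScoreDocs("solid state drive"):
    print searcher.doc(sd.doc).get("title")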
    def testWithSlop(self):

        searcher = IndexSearcher(self.directory, True)

        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             self.porterAnalyzer)
        parser.setPhraseSlop(1)

        query = parser.parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "hole accounted for")
Example #20
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary containing key=value pairs where keys are fields in our
    lucene document and values are search terms provided by the user. A 
    BooleanQuery is built from these key=value pairs
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)        
    query_list = ["%s:\"%s\"" % (field, process_query_param(val)) 
                                   for (field, val) in 
                                        get_adv_query_packet(params)]

    return parser.parse("%s" % (" " + operator + " ").join(query_list))        
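A usage sketch with hypothetical fields and values, assuming get_adv_query_packet simply yields the (field, value) pairs from params; two entries joined with AND produce a query string along the lines of name:"foo bar" AND author:"doe":

params = {"name": "Foo Bar", "author": "Doe"}  # hypothetical form input
query = build_advanced_search_query(params, "AND", analyzer)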
Example #21
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get top 50 results
        topDocs = searcher.search(query,50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Example #22
    def testDateRangeQuery(self):

        # locale diff between jre and gcj 1/1/04 -> 01/01/04
        # expression = "modified:[1/1/04 TO 12/31/04]"

        expression = "modified:[01/01/04 TO 12/31/04]"
        parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
        parser.setLocale(Locale.US)
        query = parser.parse(expression)
        print expression, "parsed to", query

        topDocs = self.searcher.search(query, 50)
        self.assert_(topDocs.totalHits > 0)
Example #23
    def testTermRangeQuery(self):

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:[K TO N]")
        self.assert_(query.getClassName() == "TermRangeQuery")

        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms")

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:{K TO Mindstorms}")
        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms",
                                    True)
Example #24
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        parsed_command = QueryParser.escape(command)
        query = QueryParser("text", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()

        try:
            hctr = 0
            for hit in hits:
                hit_id = hits.id(hctr)
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms(): trm_str += " " + trm
                print "term string: ", trm_str.encode("ascii","ignore")
                hctr += 1
                if hctr > hits.length()-2 or hctr > 100: break
                print 'uid:', hit.get("user_id"), 'timestamp: ', hit.get("timestamp"), "country: ", hit.get('country'), "emoticons: ", hit.get('emoticons')
        except Exception, e: 
            print "failed to list hit: ", e

        print
        command = raw_input("Query:")
        parsed_command = QueryParser.escape(command)
        print "Searching for emoticon:", parsed_command
        query = QueryParser("emoticons", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()

        try:
            hctr = 0
            for hit in hits:
                hit_id = hits.id(hctr)
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms(): trm_str += " " + trm
                print "term string: ", trm_str.encode("ascii","ignore")
                hctr += 1
                if hctr > hits.length()-2 or hctr > 100: break
                print 'uid:', hit.get("user_id"), 'timestamp: ', hit.get("timestamp"), "country: ", hit.get('country'), "emoticons: ", hit.get('emoticons')
        except Exception, e: 
            print "failed to list hit: ", e
Example #25
    def query(indexName, queryFile, runName):
        indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
        indSearcher = IndexSearcher(indReader)
        ir = indSearcher.getIndexReader()

        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))

        f = open('results-'+runName, 'w')

        while True:
            id = queryFile.readline()

            if id == "":
                break

            id = id.replace("C","")
            id = id.replace("\n","")

            queryString = queryFile.readline()
            queryString = queryString.replace("?","")
            queryString = queryString.replace("*","")
            queryString = queryString.replace("-","_")
            queryString = queryString.replace("\n","")

            query = qp.parse(queryString)

            queryFile.readline()

            returnedDocs = 1000
            collector = TopScoreDocCollector.create(returnedDocs, True)

            indSearcher.search(query, collector)

            hits = collector.topDocs().scoreDocs

            size = len(hits)
            print "Total hits for query " +id+ ": "+str(size)

            for rank, hit in enumerate(hits, start=1):
                doc = ir.document(hit.doc)
                f.write(id + " 0 " + doc.get('id') + " " + str(rank) + " " + str(hit.score) + " " + runName + "\n")

        f.close()
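Each line written above follows the common TREC run-file layout: query id, a constant column, document id, 1-based rank, score, and run name. An illustrative, made-up output line:

101 0 DOC-4711 1 8.7343 myRunName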
def calculateEmoticonDiffusion(emoticon, searcher, analyzer, user_location_hash, usage_threshold = 1, comm_threshold = 1):
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    emoticon_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_diffusion_stats.txt","r") 
    total_users = int(emoticon_stats_file.read().strip())
    emoticon_stats_file.close()

    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_')+".diffusion_bidir"
    print "Calculating Diffusion for: ", emoticon, " at: ", time.time()
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    if hits.length() == 0: return

    print "compiling diffusion stats at: ", time.time()
    emoticon_users_by_time_hash = {}
    emoticon_users_adopters_hash = {}
    emoticon_users_non_adopters_hash = {}
    users_exposure_hash = {}
    reverse_users_exposure_hash = {}
    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr%100000==0: print "on hit: ", hctr
            #if hctr > 100000: break
            if hctr == hits.length(): break
            uid, timestamp, country, emoticons, user_id_replied = hit.get("user_id"), int(hit.get("timestamp")), hit.get('country'), hit.get('emoticons'), hit.get('user_id_replied')
            emoticon_users_by_time_hash[uid] = emoticon_users_by_time_hash.get(uid,[])+[timestamp]
    except Exception, e:
        pass
Example #27
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    Runs a boolean query against the Lucene index.

    Arguments:
        index_dir - the Lucene index directory
        query_text - query text following http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to retrieve
    Return:
        rows - the returned document details
    '''
    DEFAULT_QUERY_FIELD = 'all'
    
    
    store = SimpleFSDirectory(File(index_dir))
    
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)
    
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
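A usage sketch; the index path and field names are hypothetical, and the query syntax follows the query-parser documentation linked in the docstring:

rows = boolean_search_lucene_index('/path/to/index', 'title:lucene AND author:doe', 20)
for row in rows:
    print row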
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", analyzer).parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example #29
    def testLowercasing(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse("PrefixQuery*")
        self.assertEqual("prefixquery*", q.toString("field"), "lowercased")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setLowercaseExpandedTerms(False)
        q = qp.parse("PrefixQuery*")
        self.assertEqual("PrefixQuery*", q.toString("field"), "not lowercased")
Example #30
    def testSlop(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"exact phrase"')
        self.assertEqual('"exact phrase"', q.toString("field"), "zero slop")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setPhraseSlop(5)
        q = qp.parse('"sloppy phrase"')
        self.assertEqual('"sloppy phrase"~5', q.toString("field"), "sloppy, implicitly")
def run(searcher, analyzer, command):
    # print "Searching for:", command
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    rankedfiles = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # print 'path:', doc.get("path"), 'name:', doc.get("name")
        rankedfiles.append(int(doc.get("name")))
    return rankedfiles
class IndexSearcherWrapper(object):
    def __init__(self, location):
        lucene.initVM()
        directory = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(directory, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                        WhitespaceAnalyzer())

    def search(self, topic, max=5000):
        query = self.query_parser.parse(topic.title)
        return self.searcher.search(query, max)
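A usage sketch; Topic is a stand-in for whatever object the caller passes, since search only reads its title attribute:

class Topic(object):
    def __init__(self, title):
        self.title = title

wrapper = IndexSearcherWrapper("/path/to/index")  # hypothetical index location
print wrapper.search(Topic("information retrieval"), max=10).totalHits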
Example #34
    def LatestSearch(self):
        try:
            searcher = IndexSearcher(self.indexDir)
            today = time.strftime('%Y%m%d')
            keyWord = today.encode('utf8')
            print keyWord
            query = QueryParser(Version.LUCENE_30, "regDate",
                                self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except:
            print 'BookSearcher TotalSearch Exception'
Example #35
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream("contents", StringReader(text))  # pass the field name, not the field value
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Example #36
def run(searcher, analyzer, input, filepath):
    #input = raw_input("Query:").decode('gbk').encode('utf8')
    #print "Search for: " + input
    command = convert(input.decode('gbk').encode('utf8'))
    print "Search for:" + command.decode('utf8').encode('gbk')
    qp = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)
    #qp.setPhraseSlop(0)
    query = qp.parse(command)
    scoreDocs = searcher.search(query, 1000000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    print

    try:
        #filepath = "D:\\TotalCode\\PyluceneSample\\Output_pylucene.txt"
        filew = open(filepath, 'w')
        result_num = 0
        for scoreDoc in scoreDocs:
            try:
                result_num += 1
                if result_num % 1000 == 0:
                    #time.sleep(5)
                    print "Search added " + str(result_num) + " sentences..."
                #print 'scoreDoc.doc:', scoreDoc.doc
                doc = searcher.doc(scoreDoc.doc)
                path = doc.get("path")
                #print "path:" + path
                #print 'name:', doc.get("name")
                #print 'sentence_num:', str(doc.get("sentence_num"))
                #print 'sentence:', doc.get("sentence")
                #sentence = GetSentence(doc.get("sentence_num"), path)
                sentence = doc.get("sentence")
                #print 'sentence:', sentence
                OutputSentence(filew, doc.get("name"), sentence)
            except:
                continue
        filew.close()
    except: #Exception, e:
        print "Failed in Outputsentence:"#, e
Example #37
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example #38
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example #39
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example #40
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example #41
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30,"<default field>",\
        lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            print "Hit Score:", "%.4f" % hit.score, "Department:", lucene_searcher.doc(
                hit.doc).get("department").encode(
                    "utf-8"), "Title:", lucene_searcher.doc(
                        hit.doc).get("title").encode("utf-8")
            print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
Example #42
def run(searcher, analyzer, querystr):
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(querystr)
    MAX = 1000
    hits = searcher.search(query, MAX)

    results = []

    for sd in hits.scoreDocs:
        doc = searcher.doc(sd.doc)
        results.append([
            doc.get("name"),
            doc.get("owner").encode('gbk'),
            doc.get("title").encode('gbk')
        ])

    # sort result
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title
Example #43
def run(searcher, analyzer, command, prior):
    if command == '':
        return

    store = []

    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs

    scored = []

    for scoreDoc in scoreDocs:
        returnfile = []
        doc = searcher.doc(scoreDoc.doc)

        if doc.get("qst_num") in scored:
            continue
        if not doc.get("qst_name"):
            continue
        scored.append(doc.get("qst_num"))

        name = doc.get("qst_name").replace(' ', '')
        returnfile.append(name)
        detail = doc.get("qst_detail").replace(' ', '')
        returnfile.append(detail)

        returnfile.append(doc.get("qst_topic_accu"))
        returnfile.append(int(doc.get("qst_browse")))
        returnfile.append(int(doc.get("qst_follow")))
        returnfile.append(int(doc.get("qst_ans")))
        returnfile.append(int(doc.get("qst_num")))

        store.append(returnfile)

    store = storesort(store, prior)
    return store
Example #44
def luceneRetriver(query):
    #print ('-------------Searching-------------')
    #print (query)
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, 'text',
                           lucene_analyzer).parse(query)
    MAX = 1000

    # list to hold the titles of the returned documents
    title_list = []

    total_hits = lucene_searcher.search(my_query, MAX)

    #print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs[:10]:

        #print"Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()

        doc = lucene_searcher.doc(hit.doc)

        #print doc.get("title").encode("utf-8").lstrip(str(TXTDIR))
        #print doc.get("text").encode("utf-8")
        #print ('\n')

        title_list.append({
            doc.get("title").encode("utf-8").lstrip(str(TXTDIR)):
            round(hit.score, 5)
        }.copy())

    return title_list


#print ('Query: 八卦')
#print ('Query results:')
#print ('\n')
#luceneRetriver("下列 关于 中国 八卦 不正确 人类 历史 东西方 平等 交流 见证")
Example #45
    def find(self, phrase):
        phrase = phrase.lower().encode('utf8')
        query = ' '.join(['+' + word for word in phrase.split(' ')])
        query = QueryParser(self._lversion, 'contents',
                            self._analyzer).parse(query)
        hits = self._searcher.search(query, self.max_candidates)

        # if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query)) # potential bug

        # todo put article_id in lucene index instead of translating document title

        links = {}
        for hit in hits.scoreDocs:
            title = quote(
                self._searcher.doc(
                    hit.doc).get("title").encode('utf-8').replace(
                        ' ', '_')).replace('%28', '(').replace('%29', ')')
            if title in self._translation:
                links[self._translation[title]] = hit.score
            # else: print title # potential bug

        return self._links[phrase].get(-1, 0), links
Example #46
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        command_dict = parseCommand(command)
        sep_command = " ".join(jieba.cut(command_dict['contents']))
        command_dict['contents'] = sep_command
        #print command_dict
        if not command_dict.has_key('site'):
            command = command_dict['contents']
        else:
            command = command_dict['contents'] + " site:" + command_dict['site']
        print
        print "Searching for:", command

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            #print 'site:', doc.get("site")
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
Example #47
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: Explainer <index dir> <query>"

        else:
            indexDir = argv[1]
            queryExpression = argv[2]

            directory = SimpleFSDirectory(File(indexDir))
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                SimpleAnalyzer()).parse(queryExpression)

            print "Query:", queryExpression

            searcher = IndexSearcher(directory)
            scoreDocs = searcher.search(query, 50).scoreDocs

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                explanation = searcher.explain(query, scoreDoc.doc)
                print "----------"
                print doc["title"].encode('utf-8')
                print explanation
Example #48
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")  # read the query string
        command = unicode(command, 'GBK')
                        # convert the query to Unicode (the indexed files were Unicode as well);
                        # the console encoding is GBK under Python IDLE, UTF-8 under PyScripter-Portable (see Q.ppt)
        if command == '':
            return

        print
        print "Searching for:", command 
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
                        # the analyzer tokenizes and linguistically processes the query string;
                        # QueryParser then parses it into a query syntax tree held in the Query object
        scoreDocs = searcher.search(query, 50).scoreDocs
                        # IndexSearcher.search walks the Query tree and collects the results
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example #49
def run(searcher, analyzer, command):
    while True:
        if command == '':
            return
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 300).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        text = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            temptext = [
                doc.get("url"),
                doc.get('title'),
                doc.get("imgurl"),
                doc.get("price"),
                doc.get("kind")
            ]
            text.append(temptext)
        return text
Example #50
        writer.addDocument(doc)

    print("Indexed lines from stdin (%d documents in index)" %
          (writer.numDocs()))
    print("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()
    print("...done optimizing index of %d documents" % writer.numDocs())
    print("Closing index of %d documents..." % writer.numDocs())
    print("...done closing index of %d documents" % writer.numDocs())
    writer.close()

    # RETRIEVAL

    dir = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(u"¿Dónde está La Mancha?")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))

    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        print(doc.get("text").encode("utf-8"))
        print(doc.get("metadata").encode("utf-8"))
Example #51
		f.write(string)
		f.write(item)

config = ConfigParser()
config.read('config.ini')
r_server = Redis('localhost')
lst = []
search = str(sys.argv[1])
if __name__ == "__main__":
    lucene.initVM()
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    for hit in hits.scoreDocs:
        if hit.score >= 0.0:
            #print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            #print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(config.get('counting','delimiter'))
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
Example #52
    def post(self):
        q = self.get_argument("query")
        k = self.get_argument("kTerms")

        # self.write(key)

        # def query(query):
        # query = self.get_argument("q")
        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (
            hits.totalHits, query)
        items = []
        rQ = []

        #for key, value in doc_urls.iteritems()
        # print (key, value)

        for hit in hits.scoreDocs:
            #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
            print hit.score, hit.doc, hit.toString()
            print(len(doc_urls))
            items.append(doc_urls[str(hit.doc)])
            print(doc_urls[str(hit.doc)])
            doc = searcher.doc(hit.doc)
            print(hit.doc)
            rQ.append("html_files/" + str(hit.doc))

        i = 0
        rqSize = 0
        for url in rQ:
            rqSize = rqSize + 1
            print(url)
            f = codecs.open(url, 'r')
            html = f.read()
            html = html.decode('utf-8')
            tag_free = strip_tags(html)
            path = 'strippedHTML_files'
            if not os.path.exists(path):
                os.makedirs(path)
            filename = str(i)
            with open(os.path.join(path, filename), 'wb') as temp_file:
                temp_file.write(tag_free.encode('utf-8'))
            i = i + 1

        path = 'strippedHTML_files'
        i = 0
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r') as myfile:
                data = myfile.read()
                stripStopWords(data, i)
                i = i + 1
        if int(k) > 0:  # kTerms arrives as a string from get_argument
            newQuery = calcNewQuery(k, q, rqSize)
            q = newQuery
            print("new query is ")
            print(q)

        self.render("index.html",
                    title="Results",
                    items=items,
                    query=q,
                    kTerms=k)
Example #53
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
if __name__ == "__main__":
    usedsentences = numpy.zeros((BLOOM_FILTER_SIZE, ), dtype=numpy.bool)
    print >> sys.stderr, "Just created bloom filter with %d entries" % usedsentences.shape[
        0]
    print >> sys.stderr, stats()

    lucene.initVM()
    # create an index called 'index-dir' in a temp directory
    #    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
    #                            'index-dir')
    #    indexDir = "/Tmp/REMOVEME.index-dir"
    indexDir = "lucene.ukwac"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
    searcher = IndexSearcher(dir)

    nonzeros = 0

    for i, l in enumerate(sys.stdin):
        if i % 100 == 0:
            print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (
                i, percent(nonzeros, BLOOM_FILTER_SIZE))
            print >> sys.stderr, stats()
        l = string.strip(l)

        added_this_sentence = 0
        for newl in retrieve(l, searcher, queryparser):
            # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
            if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
    def searchDocuments(self, view, version, query=None, attribute=None):

        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)

        if attribute:
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        class _collector(PythonHitCollector):
            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits = []

            def collect(_self, id, score):

                _self.hits.append((-score, id))

        class _iterator(object):
            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    heapify(hits)
                    while hits:
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])

                        if long(doc['version']) <= version:
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()
Example #56
    def testBoost(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse("term^2")
        self.assertEqual("term^2.0", q.toString("field"))
Example #57
    'robespierre', 'danton', 'xvi'
]  #, 'marat', 'mirabeau', 'antoinette', 'fayette', 'tyran']#, 'égalité'.decode('utf-8'), 'fraternité'.decode('utf-8'), 'révolution'.decode('utf-8'), 'salut', 'necker', 'napoleon', 'monarchie', 'aristocratie', 'hébert'.decode('utf-8'), 'gironde', 'jacobins', 'feuillants', 'royalistes','royaliste', 'guillotine', 'bastille', 'versailles', 'tuilleries', 'paume', 'constitution', 'etats', 'citoyen', 'democratie']

initVM()
# Get handle to index directory
directory = SimpleFSDirectory(File(STORE_DIR))
# Creates a searcher searching the provided index.
ireader = IndexReader.open(directory, True)
# Implements search over a single IndexReader.
# Use a single instance and use it across queries
# to improve performance.
searcher = IndexSearcher(ireader)
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Constructs a query parser. We specify what field to search into.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

appearance_dict = {}
for TERM in term_list:
    print 'Searching for: "' + TERM + '"'
    # Create the query
    query = queryParser.parse(TERM)

    # Run the query and get documents that contain the term
    docs_containing_term = searcher.search(query, ireader.numDocs())

    docs = []

    print 'Found ' + str(len(docs_containing_term.scoreDocs)
                         ) + ' documents with the term "' + TERM + '".'
    #hits = searcher.search(query, 1)
Example #58
    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    HighlightFormatter = SimpleHTMLFormatter()
    query_score = QueryScorer (query)

    highlighter = Highlighter(HighlightFormatter, query_score)
Example #59
def process_query_param(param):
    """
    Escapes and lowercases all query params for searching in the lucene index.
    """
    processed_param = QueryParser.escape(param)
    return processed_param.lower()
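A usage sketch; QueryParser.escape backslash-escapes Lucene's special characters (such as +, -, !, parentheses and wildcards) before the lowercasing:

print process_query_param('C++ AND (graphics)')
# prints roughly: c\+\+ and \(graphics\)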
Example #60
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    # Search documents
    onlyfiles = [
        f for f in listdir(inputDir)
        if isfile(join(inputDir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        json_data = open(inputDir + '/' + f)
        data = json.load(json_data)
        # The results collected after comparison
        results = list()

        journal_code = f.split('.')[0]

        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" %
                         (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            entry['similars'] = list()

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title",
                                    analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" %
                             (hits.totalHits, q))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    entry['similars'].append({
                        'token': token,
                        'url': doc.get('url'),
                        'title': doc.get('title')
                    })

            results.append(entry)
        json_data.close()

        print """<html>
	<body>
	<table><thead>
	<tr>
	<th>Jornal</th><th>Data</th><th>T&iacute;tulo</th><th>URL</th><th>Not&iacute;cias semelhantes</th>
	</tr>
	</thead>
	<tbody>
	"""
        for entry in results:
            similars = entry['similars']
            similars_text = '<ul>'
            for s in similars:
                similars_text += '<li>[%s] [%s] [%s]</li>' % (
                    s['token'].encode('iso-8859-1', 'ignore'),
                    s['title'].encode('iso-8859-1', 'ignore'), s['url'].encode(
                        'iso-8859-1', 'ignore'))
            similars_text += '</ul>'
            print """<tr>
	<td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>
	</tr>
	""" % (journal_code, entry['date'].encode('iso-8859-1', 'ignore'),
            entry['title'].encode('iso-8859-1', 'ignore'), entry['url'].encode(
            'iso-8859-1', 'ignore'), similars_text)