def searchKey(self, key , rank = None):
        query = ""
        try:
            MAX = 100000
            qp = QueryParser(Version.LUCENE_35, "key", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(key)
#             print ("query",query)
                        
            hits = searcher.search(query, MAX)

            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                try:
                    sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
                except:
                    print doc.get("sentence")
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+key)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example no. 2
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Example no. 3
    def query(indexName, queryString):

        indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))
        qp.setDefaultOperator(qp.Operator.AND)
         
        query = qp.parse(queryString.replace("-","_"))
                
        aux = indSearcher.search(query, 100)
        results = aux.scoreDocs
        hits = aux.totalHits
        
        ir = indSearcher.getIndexReader()

        #results = collector.topDocs()
        i = 0

        res = []
    
        for r in results:
            # look up each hit by its Lucene document id, not the loop counter
            doc = ir.document(r.doc)
            res.insert(i, doc.get('id'))
            i += 1
            
        return res
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example no. 5
    def search(self, string ,special = None):
        query = ""
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            qp = QueryParser(Version.LUCENE_35, "title", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
#            print ("query",query)
                        
            hits = searcher.search(query, MAX)
            
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                sentence_list.append(doc.get("title").encode("utf-8"))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string)
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
Example no. 6
    def searchXYPair(self,x,y):
        """
        Returns all sentences, which are tagged with the given two entities (x,y)
        """
        tmp_hm = {}
        if x == "" or y == "":
            return []
        try:
            array = re.findall(r'[\w\s]+',x)
            x = ""
            for item in array:
                x+=item
            qp = QueryParser(Version.LUCENE_35, "X", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(x)
            MAX = 100000
            result_list = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                y_entry = doc["Y"]
                if y_entry == y:
                    tmp_hm[doc["Sentence"]]=""
                    
            for key in tmp_hm:
                result_list.append(IndexUtils.sentence_wrapper(key))
            tmp_hm = {}
            return result_list
        except:
            print("Fail (search XYPair) in x:"+x+" y:"+y)
            print "Unexpected error:", sys.exc_info()[0]
            print

            
        return []
    def searchForDbpediaURI(self, uri):
        """
        Returns all anchor texts, which are related to the given DBpedia URI.
        Also returns for each anchor text the corresponding URI and the number of how often the anchor appears on the english Wikipedia
        """
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            MAX = 10000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
                if dbpedia_uri == uri_old:
                    result.append([doc["anchor"].encode("utf-8"), doc["anchor_uri"].encode("utf-8"), dbpedia_uri, doc["number"].encode("utf-8")])
            return result
        except:
            print("searchForDbpediaURI - Fail in uri: "+uri)
            return []
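
Both helpers above strip Lucene operators out of the raw input by hand with re.findall before parsing. A minimal alternative sketch, assuming the same module-level searcher and analyzer globals these methods rely on, that uses QueryParser.escape (seen in later examples) instead of the manual cleanup:

def search_x_sentences(x, max_hits=100000):
    # hypothetical helper: escape the raw entity string instead of regex-stripping it
    qp = QueryParser(Version.LUCENE_35, "X", analyzer)
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parse(QueryParser.escape(x))
    hits = searcher.search(query, max_hits)
    # collect the tagged sentences stored in the "Sentence" field
    return [searcher.doc(hit.doc).get("Sentence") for hit in hits.scoreDocs]
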
Example no. 8
    def searchForDbpediaURI(self, uri):
        """
        Returns all sentences, which are tagged with the given DBpedia URI
        """
        print "in searchForDbpediaURI" 
        uri_old = uri
        uri = uri.replace("http://dbpedia.org/ontology/","")
        uri = uri.replace("http://dbpedia.org/property/","")
        uri = uri.replace("http://dbpedia.org/resource/","")

        array = re.findall(r'[\w\s]+',uri)
        uri = ""
        for item in array:
            uri+=item
        
        try:
            qp = QueryParser(Version.LUCENE_35, "URI", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(uri)
            print "query: "+str(query)
            MAX = 500000
            result = []
            hits = searcher.search(query, MAX)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                dbpedia_uri = doc["URI"]
                if dbpedia_uri == uri_old:
                    result.append([IndexUtils.sentence_wrapper(doc["Sentence"]), doc["X"], doc["Y"],dbpedia_uri])
            return result
        except:
            print("Fail in uri: "+uri)
            print "Unexpected error:", sys.exc_info()[0]
            return result
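
The lookup above runs an analyzed query and then filters the hits in Python against the untouched URI. A hedged sketch of pushing that exact-match constraint into the query itself, reusing BooleanQuery, TermQuery and Term as in the other examples; it assumes the URI field is also stored as a single untokenised term, which the snippet above does not confirm:

def searchForDbpediaURIExact(uri, max_hits=500000):
    # hypothetical variant: require the full URI as a MUST clause instead of post-filtering
    combined = BooleanQuery()
    combined.add(TermQuery(Term("URI", uri)), BooleanClause.Occur.MUST)
    hits = searcher.search(combined, max_hits)
    return [searcher.doc(hit.doc)["Sentence"] for hit in hits.scoreDocs]
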
Example no. 9
 def does_line_existNew(self,line,x,y):
     """
     Checks, if parsed sentence already exists in index
     """
     query = ""
     try:
         array = re.findall(r'[\w]+',line)
         string = ""
         for item in array:
             string+=item+" "
         qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer)
         qp.setDefaultOperator(qp.Operator.AND)
         query = qp.parse(string)
         
         MAX = 10
         hits = searcher.search(query, MAX)
         if len(hits.scoreDocs)>0:
             return True
         else:
             return False
     except Exception:
         s_tmp =  str(sys.exc_info())
         if "too many boolean clauses" in s_tmp:
             print "too many boolean clauses"
             """
             Returns true, so that the sentence is not added each time, to avoid further error messages.
             Only occours with very large sentences.
             """
             return True
         else:
             print "Unexpected error:", sys.exc_info()[0]
             print "in does line exist"
             print s_tmp
     return False
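
A short usage sketch of the duplicate check above, assuming it is called from the indexing loop of the same class; the writer variable and the Field options are assumptions about how the sentence index is built, not something the snippet shows:

     # hypothetical indexing guard built on the check above
     if not self.does_line_existNew(line, x, y):
         doc = Document()
         doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
         writer.addDocument(doc)
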
Example no. 10
    def searchString(self, string):
        'searches for a string and returns an array of POS-tagged sentences'
        query = ""
        #print("Input String: ",string)
        try:
            MAX = 100000
            #for dates such as 1931.08.06
            string = string.replace("."," ")
            
            array = re.findall(r'[\w\s]+',string)
            string = ""
            for item in array:
                string+=item
            #print("Input String2: ",string)
            qp = QueryParser(Version.LUCENE_35, "sentence", analyzer)
            qp.setDefaultOperator(qp.Operator.AND)
            query = qp.parse(string)
            #print ("query",query)
                        
            hits = searcher.search(query, MAX)
            #print len(hits)
            sentence_list = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                #print doc.get("sentence")
                sentence_list.append(eval(doc.get("sentence").encode("utf-8")))
            return sentence_list
        except:
            print("Fail in receiving sentence with term "+string+" in search term")
            print ("query",query)
            print "Unexpected error:", sys.exc_info()[0]
#            raw_input("wait")
            print
            return []
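
The eval call above implies that the "sentence" field stores the repr of a Python list of (token, tag) pairs. A minimal illustration of that round trip; the tagged sentence is made up:

# illustrative only: how a POS-tagged sentence could be serialised for this index
tagged = [("Gandhi", "NNP"), ("was", "VBD"), ("born", "VBN")]
stored = str(tagged)              # the string that would sit in the "sentence" field
assert eval(stored) == tagged     # searchString recovers the list with eval
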
Example no. 11
    def main(cls, argv):

        allBooks = MatchAllDocsQuery()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        query = BooleanQuery()
        query.add(allBooks, BooleanClause.Occur.SHOULD)
        query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        example = SortingExample(directory)

        example.displayResults(query, Sort.RELEVANCE)
        example.displayResults(query, Sort.INDEXORDER)
        example.displayResults(query,
                               Sort(SortField("category", SortField.STRING)))
        example.displayResults(query,
                               Sort(SortField("pubmonth", SortField.INT, True)))

        example.displayResults(query,
                               Sort([SortField("category", SortField.STRING),
                                     SortField.FIELD_SCORE,
                                     SortField("pubmonth", SortField.INT, True)]))

        example.displayResults(query,
                               Sort([SortField.FIELD_SCORE,
                                     SortField("category", SortField.STRING)]))
        directory.close()
Example no. 12
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field", analyzer).parse('"This is Some Phrase*"')
        self.assertEqual('"some phrase"', q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
Example no. 13
    def main(cls):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''
Example no. 14
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term
    
    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    try:
        file = open('../features.txt', 'r')
        
        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line
             
        querywordlist = query.split()
        
        featureQueryList = []
        productQueryList = []
        
        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        
        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "
                
            print featureQuery
        
        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "
            
        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        return ("", "(" + query + ")", [], querywordlist)
Example no. 15
    def __init__(self, emoticon, searcher, analyzer, english_only=False):
        super(PMICalculator, self).__init__()

        self.field = "emoticons"
        self.emoticon = emoticon
        self.searcher = searcher
        self.analyzer = analyzer
        self.escaped_emoticon = QueryParser.escape(self.emoticon)
        self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
        self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
        if english_only:
            country = "United States"
            country_prefix = "US"
        else:
            country = None
            country_prefix = ""
        self.pmi_file_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".pmidata"
        )
        self.sample_tweets_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".samptweets"
        )
        self.sample_tweets_file = codecs.open(self.sample_tweets_name, encoding="utf-8", mode="w")
        self.term_count_collector = TermCountCollector(searcher, emoticon, country)
        print "starting query at: ", time.time()
        hits = self.searcher.search(self.query, self.term_count_collector)
        # print "terms: ", self.terms
        if emoticon == ":P":
            ee_two = QueryParser.escape(":p")
        elif emoticon == "T_T":
            ee_two = QueryParser.escape("TT")
        elif emoticon == "^_^":
            ee_two = QueryParser.escape("^^")
        if emoticon in [":P", "T_T", "^_^"]:
            q_two = QueryParser("emoticons", self.analyzer).parse(ee_two)
            hits_two = self.searcher.search(q_two, self.term_count_collector)
        self.terms = self.term_count_collector.getTerms()
        self.query_result_count = self.term_count_collector.getDocCount()
        for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items():
            for p_term_tweet in p_term_tweets:
                self.sample_tweets_file.write("term: " + p_term + " tweet: " + p_term_tweet + "\n")
        self.sample_tweets_file.close()
        self.base_stats_file = open(
            "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_pmi_stats.txt", "r"
        )
        self.n = int(self.base_stats_file.read().strip().split(":")[1])

        print "computing PMI for query: ", self.emoticon, " at: ", time.time()

        self.p_query_result = self.query_result_count * 1.0 / self.n
Example no. 16
    def testBasicQueryParser(self):

        analyzer = SimpleAnalyzer()
        query = QueryParser(Version.LUCENE_CURRENT, "description",
                            analyzer).parse("partnum:Q36 AND SPACE")

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        self.assertEqual("+partnum:q +space", query.toString("description"),
                         "note Q36 -> q")
        self.assertEqual(0, len(scoreDocs), "doc not found :(")
Example no. 17
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field",
                        analyzer).parse('"This is Some Phrase*"')
        self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"term"')
        self.assert_("TermQuery" == q.getClassName(), "reduced to TermQuery")
Example no. 18
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    # create parser for user submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs
    
    return scoreDocs
    def testWithSlop(self):

        searcher = IndexSearcher(self.directory, True)

        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             self.porterAnalyzer)
        parser.setPhraseSlop(1)

        query = parser.parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "hole accounted for")
Example no. 20
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary containing key=value pairs where keys are fields in our
    lucene document and values are search terms provided by the user. A 
    BooleanQuery is built from these key=value pairs
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)        
    query_list = ["%s:\"%s\"" % (field, process_query_param(val)) 
                                   for (field, val) in 
                                        get_adv_query_packet(params)]

    return parser.parse("%s" % (" " + operator + " ").join(query_list))        
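
A hedged usage sketch for the builder above, assuming get_adv_query_packet simply yields the (field, value) pairs of the params dict and that the index exposes name and author fields (both field names are illustrative, and analyzer/searcher are the usual module-level objects):

# hypothetical call: AND together two fielded, escaped phrase terms
params = {"name": "lucene in action", "author": "gospodnetic"}
query = build_advanced_search_query(params, "AND", analyzer)
scoreDocs = searcher.search(query, 50).scoreDocs
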
Example no. 21
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get top 50 results
        topDocs = searcher.search(query,50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Example no. 22
    def testDateRangeQuery(self):

        # locale diff between jre and gcj 1/1/04 -> 01/01/04
        # expression = "modified:[1/1/04 TO 12/31/04]"

        expression = "modified:[01/01/04 TO 12/31/04]"
        parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
        parser.setLocale(Locale.US)
        query = parser.parse(expression)
        print expression, "parsed to", query

        topDocs = self.searcher.search(query, 50)
        self.assert_(topDocs.totalHits > 0)
Example no. 23
    def testTermRangeQuery(self):

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:[K TO N]")
        self.assert_(query.getClassName() == "TermRangeQuery")

        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms")

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:{K TO Mindstorms}")
        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms",
                                    True)
Example no. 24
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        parsed_command = QueryParser.escape(command)
        query = QueryParser("text", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()

        try:
            hctr = 0
            for hit in hits:
                hit_id = hits.id(hctr), 
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms(): trm_str += " " + trm
                print "term string: ", trm_str.encode("ascii","ignore")
                hctr += 1
                if hctr > hits.length()-2 or hctr > 100: break
                print 'uid:', hit.get("user_id"), 'timestamp: ', hit.get("timestamp"), "country: ", hit.get('country'), "emoticons: ", hit.get('emoticons')
        except Exception, e: 
            print "failed to list hit: ", e

        print
        command = raw_input("Query:")
        parsed_command = QueryParser.escape(command)
        print "Searching for emoticon:", parsed_command
        query = QueryParser("emoticons", analyzer).parse(parsed_command)
        hits = searcher.search(query)
        print "%s total matching documents." % hits.length()

        try:
            hctr = 0
            for hit in hits:
                hit_id = hits.id(hctr), 
                hit_tv = searcher.getIndexReader().getTermFreqVector(hits.id(hctr), "text")
                trm_str = ""
                for trm in hit_tv.getTerms(): trm_str += " " + trm
                print "term string: ", trm_str.encode("ascii","ignore")
                hctr += 1
                if hctr > hits.length()-2 or hctr > 100: break
                print 'uid:', hit.get("user_id"), 'timestamp: ', hit.get("timestamp"), "country: ", hit.get('country'), "emoticons: ", hit.get('emoticons')
        except Exception, e: 
            print "failed to list hit: ", e
Example no. 25
    def query(indexName, queryFile, runName):
        indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
        indSearcher = IndexSearcher(indReader)
        ir = indSearcher.getIndexReader()

        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))

        f = open('results-'+runName, 'w')

        while(True):
            id = queryFile.readline()

            if id == "":
                break

            id = id.replace("C","")
            id = id.replace("\n","")

            queryString = queryFile.readline()
            queryString = queryString.replace("?","")
            queryString = queryString.replace("*","")
            queryString = queryString.replace("-","_")
            queryString = queryString.replace("\n","")

            query = qp.parse(queryString)

            queryFile.readline()

            returnedDocs = 1000
            collector = TopScoreDocCollector.create(returnedDocs, True)

            indSearcher.search(query, collector)

            hits = collector.topDocs().scoreDocs

            size = len(hits)
            print "Total hits for query " +id+ ": "+str(size)

            i = 0
            for hit in hits:        
                docId = hits[i].doc
                score = hits[i].score
                doc = ir.document(docId)
                j = i + 1
                f.write(id + " 0 " + doc.get('id') + " " + str(j) + " " + str(score) +" " + runName +"\n")
                i+=1

        f.close()
def calculateEmoticonDiffusion(emoticon, searcher, analyzer, user_location_hash, usage_threshold = 1, comm_threshold = 1):
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    emoticon_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_diffusion_stats.txt","r") 
    total_users = int(emoticon_stats_file.read().strip())
    emoticon_stats_file.close()

    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_')+".diffusion_bidir"
    print "Calculating Diffusion for: ", emoticon, " at: ", time.time()
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    if hits.length() == 0: return

    print "compiling diffusion stats at: ", time.time()
    emoticon_users_by_time_hash = {}
    emoticon_users_adopters_hash = {}
    emoticon_users_non_adopters_hash = {}
    users_exposure_hash = {}
    reverse_users_exposure_hash = {}
    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr%100000==0: print "on hit: ", hctr
            #if hctr > 100000: break
            if hctr == hits.length(): break
            uid, timestamp, country, emoticons, user_id_replied = hit.get("user_id"), int(hit.get("timestamp")), hit.get('country'), hit.get('emoticons'), hit.get('user_id_replied')
            emoticon_users_by_time_hash[uid] = emoticon_users_by_time_hash.get(uid,[])+[timestamp]
    except Exception, e:
        pass
Example no. 27
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    This function searches a boolean query in the learned lucene index 
    
    Arguments: 
        index_dir - the lucene index directory 
        query_text - the query text which follows http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to be retrieved 
    Return: 
        rows - the returned document details 

    
    '''
    DEFAULT_QUERY_FIELD = 'all'
    
    
    store = SimpleFSDirectory(File(index_dir))
    
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)
    
    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

    
    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty'))) # the unique file id of a file 
        row.append(scoreDoc.score)
        
        rows.append(row)
    
    return rows
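
A minimal usage sketch for the function above; the index path and query text are illustrative, and the query string follows the Lucene syntax the docstring points to:

# hypothetical call: fetch up to 20 records whose default 'all' field matches both terms
rows = boolean_search_lucene_index('/path/to/lucene/index', 'apache AND lucene', 20)
for row in rows:
    print row[-1]   # the hit score is appended as the last column above
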
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", analyzer).parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT,
                            "contents", perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example no. 29
    def testLowercasing(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse("PrefixQuery*")
        self.assertEqual("prefixquery*", q.toString("field"), "lowercased")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setLowercaseExpandedTerms(False)
        q = qp.parse("PrefixQuery*")
        self.assertEqual("PrefixQuery*", q.toString("field"), "not lowercased")
Example no. 30
    def testSlop(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer).parse('"exact phrase"')
        self.assertEqual('"exact phrase"', q.toString("field"), "zero slop")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
        qp.setPhraseSlop(5)
        q = qp.parse('"sloppy phrase"')
        self.assertEqual('"sloppy phrase"~5', q.toString("field"), "sloppy, implicitly")
def run(searcher, analyzer, command):
    # print "Searching for:", command
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    rankedfiles = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # print 'path:', doc.get("path"), 'name:', doc.get("name")
        rankedfiles.append(int(doc.get("name")))
    return rankedfiles
Example no. 32
class IndexSearcherWrapper(object):
    def __init__(self, location):
        lucene.initVM()
        directory = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(directory, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                        WhitespaceAnalyzer())

    def search(self, topic, max=5000):
        query = self.query_parser.parse(topic.title)
        return self.searcher.search(query, max)
Example no. 33
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example no. 34
    def LatestSearch(self):
        try:
            searcher = IndexSearcher(self.indexDir)
            today = time.strftime('%Y%m%d')
            keyWord = today.encode('utf8')
            print keyWord
            query = QueryParser(Version.LUCENE_30, "regDate",
                                self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except:
            print 'BookSearcher LatestSearch Exception'
Example no. 35
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream("contents", StringReader(text))  # tokenStream expects the field name, not the field value
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Example no. 36
def run(searcher, analyzer, input, filepath):
    #input = raw_input("Query:").decode('gbk').encode('utf8')
    #print "Search for: " + input
    command = convert(input.decode('gbk').encode('utf8'))
    print "Search for:" + command.decode('utf8').encode('gbk')
    qp = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)
    #qp.setPhraseSlop(0)
    query = qp.parse(command)
    scoreDocs = searcher.search(query, 1000000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    print

    try:
        #filepath = "D:\\TotalCode\\PyluceneSample\\Output_pylucene.txt"
        filew = open(filepath, 'w')
        result_num = 0
        for scoreDoc in scoreDocs:
            try:
                result_num += 1
                if result_num % 1000 == 0:
                    #time.sleep(5)
                    print "Search added " + str(result_num) + " sentences..."
                #print 'scoreDoc.doc:', scoreDoc.doc
                doc = searcher.doc(scoreDoc.doc)
                path = doc.get("path")
                #print "path:" + path
                #print 'name:', doc.get("name")
                #print 'sentence_num:', str(doc.get("sentence_num"))
                #print 'sentence:', doc.get("sentence")
                #sentence = GetSentence(doc.get("sentence_num"), path)
                sentence = doc.get("sentence")
                #print 'sentence:', sentence
                OutputSentence(filew, doc.get("name"), sentence)
            except:
                continue
        filew.close()
    except: #Exception, e:
        print "Failed in Outputsentence:"#, e
Example no. 37
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example no. 38
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example no. 39
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example no. 40
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example no. 41
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30,"<default field>",\
        lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            print "Hit Score:", "%.4f" % hit.score, "Department:", lucene_searcher.doc(
                hit.doc).get("department").encode(
                    "utf-8"), "Title:", lucene_searcher.doc(
                        hit.doc).get("title").encode("utf-8")
            print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
Example no. 42
def run(searcher, analyzer, querystr):
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(querystr)
    MAX = 1000
    hits = searcher.search(query, MAX)

    results = []

    for sd in hits.scoreDocs:
        doc = searcher.doc(sd.doc)
        results.append([
            doc.get("name"),
            doc.get("owner").encode('gbk'),
            doc.get("title").encode('gbk')
        ])

    # sort result
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title
Example no. 43
def run(searcher, analyzer, command, prior):
    if command == '':
        return

    store = []

    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs

    scored = []

    for scoreDoc in scoreDocs:
        returnfile = []
        doc = searcher.doc(scoreDoc.doc)

        if doc.get("qst_num") in scored:
            continue
        if not doc.get("qst_name"):
            continue
        scored.append(doc.get("qst_num"))

        name = doc.get("qst_name").replace(' ', '')
        returnfile.append(name)
        detail = doc.get("qst_detail").replace(' ', '')
        returnfile.append(detail)

        returnfile.append(doc.get("qst_topic_accu"))
        returnfile.append(int(doc.get("qst_browse")))
        returnfile.append(int(doc.get("qst_follow")))
        returnfile.append(int(doc.get("qst_ans")))
        returnfile.append(int(doc.get("qst_num")))

        store.append(returnfile)

    store = storesort(store, prior)
    return store
Example no. 44
def luceneRetriver(query):
    #print ('-------------Searching-------------')
    #print (query)
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, 'text',
                           lucene_analyzer).parse(query)
    MAX = 1000

    # list holding the titles of the returned documents
    title_list = []

    total_hits = lucene_searcher.search(my_query, MAX)

    #print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs[:10]:

        #print"Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()

        doc = lucene_searcher.doc(hit.doc)

        #print doc.get("title").encode("utf-8").lstrip(str(TXTDIR))
        #print doc.get("text").encode("utf-8")
        #print ('\n')

        title_list.append({
            doc.get("title").encode("utf-8").lstrip(str(TXTDIR)):
            round(hit.score, 5)
        }.copy())

    return title_list


#print ('Query terms: 八卦')
#print ('Results:')
#print ('\n')
#luceneRetriver("下列 关于 中国 八卦 不正确 人类 历史 东西方 平等 交流 见证")
Example no. 45
    def find(self, phrase):
        phrase = phrase.lower().encode('utf8')
        query = ' '.join(['+' + word for word in phrase.split(' ')])
        query = QueryParser(self._lversion, 'contents',
                            self._analyzer).parse(query)
        hits = self._searcher.search(query, self.max_candidates)

        # if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query)) # potential bug

        # todo put article_id in lucene index instead of translating document title

        links = {}
        for hit in hits.scoreDocs:
            title = quote(
                self._searcher.doc(
                    hit.doc).get("title").encode('utf-8').replace(
                        ' ', '_')).replace('%28', '(').replace('%29', ')')
            if title in self._translation:
                links[self._translation[title]] = hit.score
            # else: print title # potential bug

        return self._links[phrase].get(-1, 0), links
Example no. 46
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        command_dict = parseCommand(command)
        sep_command = " ".join(jieba.cut(command_dict['contents']))
        command_dict['contents'] = sep_command
        #print command_dict
        if not command_dict.has_key('site'):
            command = command_dict['contents']
        else:
            command = command_dict['contents'] + " site:" + command_dict['site']
        print
        print "Searching for:", command

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            #print 'site:', doc.get("site")
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
Example no. 47
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: Explainer <index dir> <query>"

        else:
            indexDir = argv[1]
            queryExpression = argv[2]

            directory = SimpleFSDirectory(File(indexDir))
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                SimpleAnalyzer()).parse(queryExpression)

            print "Query:", queryExpression

            searcher = IndexSearcher(directory)
            scoreDocs = searcher.search(query, 50).scoreDocs

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                explanation = searcher.explain(query, scoreDoc.doc)
                print "----------"
                print doc["title"].encode('utf-8')
                print explanation
Example no. 48
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:") # read the query string from the console
        command = unicode(command, 'GBK')
                        # convert the query to Unicode (the index was also built from Unicode text);
                        # the console encoding is GBK under Python IDLE and UTF-8 under PyScripter-Portable (see Q.ppt)
        if command == '':
            return

        print
        print "Searching for:", command 
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
                        # the analyzer tokenises and normalises the query string;
                        # QueryParser then parses it into the query syntax tree held in the Query object
        scoreDocs = searcher.search(query, 50).scoreDocs
                        # IndexSearcher runs search on the Query tree and returns the matching documents
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example no. 49
def run(searcher, analyzer, command):
    while True:
        if command == '':
            return
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 300).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        text = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            temptext = [
                doc.get("url"),
                doc.get('title'),
                doc.get("imgurl"),
                doc.get("price"),
                doc.get("kind")
            ]
            text.append(temptext)
        return text
Example no. 50
        writer.addDocument(doc)

    print("Indexed lines from stdin (%d documents in index)" %
          (writer.numDocs()))
    print("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()
    print("...done optimizing index of %d documents" % writer.numDocs())
    print("Closing index of %d documents..." % writer.numDocs())
    print("...done closing index of %d documents" % writer.numDocs())
    writer.close()

    # RETRIEVAL

    dir = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(u"¿Dónde está La Mancha?")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))

    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        print(doc.get("text").encode("utf-8"))
        print(doc.get("metadata").encode("utf-8"))
Example no. 51
		f.write(string)
		f.write(item)

config = ConfigParser()
config.read('config.ini')
r_server = Redis('localhost')
lst = []
search = str(sys.argv[1])
if __name__ == "__main__":
    lucene.initVM()
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    for hit in hits.scoreDocs:
        if hit.score >= 0.0:
            #print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            #print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(config.get('counting','delimiter'))
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
Example no. 52
    def post(self):
        q = self.get_argument("query")
        k = self.get_argument("kTerms")

        # self.write(key)

        # def query(query):
        # query = self.get_argument("q")
        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (
            hits.totalHits, query)
        items = []
        rQ = []

        #for key, value in doc_urls.iteritems()
        # print (key, value)

        for hit in hits.scoreDocs:
            #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
            print hit.score, hit.doc, hit.toString()
            print(len(doc_urls))
            items.append(doc_urls[str(hit.doc)])
            print(doc_urls[str(hit.doc)])
            doc = searcher.doc(hit.doc)
            print(hit.doc)
            rQ.append("html_files/" + str(hit.doc))

        i = 0
        rqSize = 0
        for url in rQ:
            rqSize = rqSize + 1
            print(url)
            f = codecs.open(url, 'r')
            html = f.read()
            html = html.decode('utf-8')
            tag_free = strip_tags(html)
            path = 'strippedHTML_files'
            if not os.path.exists(path):
                os.makedirs(path)
            filename = str(i)
            with open(os.path.join(path, filename), 'wb') as temp_file:
                temp_file.write(tag_free.encode('utf-8'))
            i = i + 1

        path = 'strippedHTML_files'
        i = 0
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r') as myfile:
                data = myfile.read()
                stripStopWords(data, i)
                i = i + 1
        if int(k) > 0:  # kTerms arrives as a string from get_argument
            newQuery = calcNewQuery(k, q, rqSize)
            q = newQuery
            print("new query is ")
            print(q)

        self.render("index.html",
                    title="Results",
                    items=items,
                    query=q,
                    kTerms=k)
Example no. 53
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
Example no. 54
if __name__ == "__main__":
    usedsentences = numpy.zeros((BLOOM_FILTER_SIZE, ), dtype=numpy.bool)
    print >> sys.stderr, "Just created bloom filter with %d entries" % usedsentences.shape[
        0]
    print >> sys.stderr, stats()

    lucene.initVM()
    # create an index called 'index-dir' in a temp directory
    #    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
    #                            'index-dir')
    #    indexDir = "/Tmp/REMOVEME.index-dir"
    indexDir = "lucene.ukwac"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
    searcher = IndexSearcher(dir)

    nonzeros = 0

    for i, l in enumerate(sys.stdin):
        if i % 100 == 0:
            print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (
                i, percent(nonzeros, BLOOM_FILTER_SIZE))
            print >> sys.stderr, stats()
        l = string.strip(l)

        added_this_sentence = 0
        for newl in retrieve(l, searcher, queryparser):
            # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
            if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
    def searchDocuments(self, view, version, query=None, attribute=None):

        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)

        if attribute:
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        class _collector(PythonHitCollector):
            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits = []

            def collect(_self, id, score):

                _self.hits.append((-score, id))

        class _iterator(object):
            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    heapify(hits)
                    while hits:
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])

                        if long(doc['version']) <= version:
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()
Example no. 56
    def testBoost(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse("term^2")
        self.assertEqual("term^2.0", q.toString("field"))
Example no. 57
    'robespierre', 'danton', 'xvi'
]  #, 'marat', 'mirabeau', 'antoinette', 'fayette', 'tyran']#, 'égalité'.decode('utf-8'), 'fraternité'.decode('utf-8'), 'révolution'.decode('utf-8'), 'salut', 'necker', 'napoleon', 'monarchie', 'aristocratie', 'hébert'.decode('utf-8'), 'gironde', 'jacobins', 'feuillants', 'royalistes','royaliste', 'guillotine', 'bastille', 'versailles', 'tuilleries', 'paume', 'constitution', 'etats', 'citoyen', 'democratie']

initVM()
# Get handle to index directory
directory = SimpleFSDirectory(File(STORE_DIR))
# Creates a searcher searching the provided index.
ireader = IndexReader.open(directory, True)
# Implements search over a single IndexReader.
# Use a single instance and use it across queries
# to improve performance.
searcher = IndexSearcher(ireader)
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Constructs a query parser. We specify what field to search into.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

appearance_dict = {}
for TERM in term_list:
    print 'Searching for: "' + TERM + '"'
    # Create the query
    query = queryParser.parse(TERM)

    # Run the query and get documents that contain the term
    docs_containing_term = searcher.search(query, ireader.numDocs())

    docs = []

    print 'Found ' + str(len(docs_containing_term.scoreDocs)
                         ) + ' documents with the term "' + TERM + '".'
    #hits = searcher.search(query, 1)
Example no. 58
    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    HighlightFormatter = SimpleHTMLFormatter()
    query_score = QueryScorer (query)

    highlighter = Highlighter(HighlightFormatter, query_score)
Example no. 59
def process_query_param(param):
    """
    Escapes and lowercases all query params for searching in the lucene index.
    """
    processed_param = QueryParser.escape(param)
    return processed_param.lower()
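
A small illustration of the helper above; the input is made up, and the output shown assumes the standard QueryParser.escape behaviour of backslash-escaping query syntax characters such as the colon and parentheses:

# hypothetical input containing Lucene special characters
print process_query_param('Title:(Foo)')   # -> title\:\(foo\)
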
Example no. 60
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    # Search documents
    onlyfiles = [
        f for f in listdir(inputDir)
        if isfile(join(inputDir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        json_data = open(inputDir + '/' + f)
        data = json.load(json_data)
        # The results collected after comparison
        results = list()

        journal_code = f.split('.')[0]

        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" %
                         (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            entry['similars'] = list()

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title",
                                    analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" %
                             (hits.totalHits, q))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    entry['similars'].append({
                        'token': token,
                        'url': doc.get('url'),
                        'title': doc.get('title')
                    })

            results.append(entry)
        json_data.close()

        print """<html>
	<body>
	<table><thead>
	<tr>
	<th>Jornal</th><th>Data</th><th>T&iacute;tulo</th><th>URL</th><th>Not&iacute;cias semelhantes</th>
	</tr>
	</thead>
	<tbody>
	"""
        for entry in results:
            similars = entry['similars']
            similars_text = '<ul>'
            for s in similars:
                similars_text += '<li>[%s] [%s] [%s]</li>' % (
                    s['token'].encode('iso-8859-1', 'ignore'),
                    s['title'].encode('iso-8859-1', 'ignore'), s['url'].encode(
                        'iso-8859-1', 'ignore'))
            similars_text += '</ul>'
            print """<tr>
	<td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>
	</tr>
	""" % (journal_code, entry['date'].encode('iso-8859-1', 'ignore'),
            entry['title'].encode('iso-8859-1', 'ignore'), entry['url'].encode(
            'iso-8859-1', 'ignore'), similars_text)